diff --git a/.github/workflows/e2e-suite.yaml b/.github/workflows/e2e-suite.yaml new file mode 100644 index 0000000..dce5b8a --- /dev/null +++ b/.github/workflows/e2e-suite.yaml @@ -0,0 +1,115 @@ +# Reusable e2e workflow (workflow_call): shared setup (build image, kind, deploy +# fluence base), then run ONE test suite — a directory under test/e2e/. The +# suite's tests are DISCOVERED (every NN-*.sh, run in sorted order); adding a test +# is just dropping a file in the directory, no workflow edit. If the suite needs +# special preparation it provides a setup.sh in its directory, which is run before +# the tests (the gang suite has none; the quantum suite installs the qpu add-on). +name: e2e-suite +on: + workflow_call: + inputs: + suite: + description: "test suite directory name under test/e2e/ (e.g. gang, quantum)" + required: true + type: string + +env: + IMAGE: vanessa/fluence:test + +jobs: + run: + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + + - name: Build fluence image + uses: docker/build-push-action@v6 + with: + context: . 
+ file: ./Dockerfile + push: false + load: true + tags: ${{ env.IMAGE }} + cache-from: type=gha + cache-to: type=gha,mode=max + + - name: Create k8s Kind Cluster + uses: helm/kind-action@v1.10.0 + with: + version: v0.32.0 # required for gang + node_image: kindest/node:v1.36.1 + config: ./deploy/kind-config.yaml + + - name: Free Disk Space (Ubuntu) + run: | + sudo rm -rf /usr/share/dotnet /usr/local/lib/android /opt/ghc \ + /opt/hostedtoolcache/CodeQL + sudo apt-get clean + df -h + + - name: Load docker images + run: | + cluster=$(kind get clusters) + kind load --name "$cluster" docker-image ${{ env.IMAGE }} + + - name: Deploy fluence (base) + run: | + kubectl apply -f deploy/fluence-test.yaml + kubectl rollout status -n kube-system deployment/fluence --timeout=180s + POD="" + for i in $(seq 1 60); do + POD=$(kubectl -n kube-system get pods -l app=fluence \ + -o go-template='{{range .items}}{{if not .metadata.deletionTimestamp}}{{$name := .metadata.name}}{{range .status.conditions}}{{if and (eq .type "Ready") (eq .status "True")}}{{$name}}{{"\n"}}{{end}}{{end}}{{end}}{{end}}' 2>/dev/null | head -1 || true) + [ -n "$POD" ] && break + sleep 2 + done + [ -n "$POD" ] || { echo "ERROR: no Ready non-terminating fluence pod"; kubectl -n kube-system get pods -l app=fluence -o wide; exit 1; } + echo "Using pod: $POD" + sleep 5 + kubectl -n kube-system exec "$POD" -- /bin/bash -c "cat /tmp/fluence-graph-*.json" || true + kubectl get nodes -o jsonpath='{range .items[*]}{.metadata.name}{": cpu="}{.status.allocatable.cpu}{" mem="}{.status.allocatable.memory}{"\n"}{end}' + + # Per-suite special setup, if the suite directory provides one. + - name: Suite setup (${{ inputs.suite }}) + # Pass the input via env so it is never spliced into the script text. + env: + SUITE: ${{ inputs.suite }} + run: | + s="test/e2e/${SUITE}/setup.sh" + if [ -f "$s" ]; then + echo "running $s" + bash "$s" + else + echo "no setup.sh for suite '${SUITE}' — skipping" + fi + + # Discover and run every NN-*.sh in the suite directory, in sorted order. 
+ - name: Run suite (${{ inputs.suite }}) + # Pass the input via env so it is never spliced into the script text. + env: + SUITE: ${{ inputs.suite }} + run: | + dir="test/e2e/${SUITE}" + [ -d "$dir" ] || { echo "ERROR: no such suite dir: $dir"; exit 1; } + shopt -s nullglob + tests=("$dir"/[0-9]*.sh) + [ ${#tests[@]} -gt 0 ] || { echo "ERROR: no NN-*.sh tests in $dir"; exit 1; } + IFS=$'\n' tests=($(sort <<<"${tests[*]}")); unset IFS + echo "discovered ${#tests[@]} test(s) in $dir:" + printf ' %s\n' "${tests[@]}" + for t in "${tests[@]}"; do + echo "::group::$t" + bash "$t" + echo "::endgroup::" + done + + - name: Dump diagnostics on failure + if: failure() + run: | + kubectl get pods -A -o wide + kubectl logs -n kube-system deployment/fluence || true + kubectl logs -n kube-system deployment/fluence-webhook || true diff --git a/.github/workflows/e2e-tests.yaml b/.github/workflows/e2e-tests.yaml index a6c1266..4b405f6 100644 --- a/.github/workflows/e2e-tests.yaml +++ b/.github/workflows/e2e-tests.yaml @@ -8,140 +8,15 @@ concurrency: group: ${{ github.workflow }}-${{ github.ref }} cancel-in-progress: true -env: - KIND_VERSION: v0.32.0 - IMAGE: vanessa/fluence:test - jobs: + # Fan out the suites as parallel jobs, each a call into the reusable workflow. + # The shared setup (build, kind, deploy) lives once in e2e-suite.yaml; the + # matrix runs gang and quantum concurrently. e2e: - runs-on: ubuntu-latest - steps: - - name: Checkout - uses: actions/checkout@v4 - - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v3 - - - name: Build fluence image - uses: docker/build-push-action@v6 - with: - context: . 
- file: ./Dockerfile - push: false - load: true - tags: ${{ env.IMAGE }} - cache-from: type=gha - cache-to: type=gha,mode=max - - - name: Create k8s Kind Cluster - uses: helm/kind-action@v1.10.0 - with: - version: v0.32.0 # required for gang - node_image: kindest/node:v1.36.1 - config: ./deploy/kind-config.yaml - - - name: Free Disk Space (Ubuntu) - run: | - echo "=== Disk space before cleanup ===" - df -h - - # Remove large software runtimes and tools - sudo rm -rf /usr/share/dotnet - sudo rm -rf /usr/local/lib/android - sudo rm -rf /opt/ghc - sudo rm -rf /opt/hostedtoolcache/CodeQL - - # Clean package caches - sudo apt-get clean - echo "=== Disk space after cleanup ===" - df -h - - - name: Load docker images - run: | - kind get clusters - cluster=$(kind get clusters) - kind load --name $cluster docker-image vanessa/fluence:test - - - name: Deploy fluence (base) - run: | - kubectl apply -f deploy/fluence-test.yaml - kubectl rollout status -n kube-system deployment/fluence --timeout=180s - # rollout status can return while the OLD ReplicaSet's pod is still - # Running (terminating). Selecting by phase=Running alone can grab that - # stale pod, which then 404s on exec/logs. Wait until exactly one - # fluence pod remains, and require it to be Ready and not terminating. 
- POD="" - for i in $(seq 1 60); do - # names of pods that are Ready AND have no deletionTimestamp (not terminating) - POD=$(kubectl -n kube-system get pods -l app=fluence \ - -o go-template='{{range .items}}{{if not .metadata.deletionTimestamp}}{{$name := .metadata.name}}{{range .status.conditions}}{{if and (eq .type "Ready") (eq .status "True")}}{{$name}}{{"\n"}}{{end}}{{end}}{{end}}{{end}}' 2>/dev/null | head -1 || true) - [ -n "$POD" ] && break - sleep 2 - done - [ -n "$POD" ] || { echo "ERROR: no Ready non-terminating fluence pod found"; kubectl -n kube-system get pods -l app=fluence -o wide; exit 1; } - echo "Using pod: $POD" - # Brief sleep to let the container runtime stabilize before exec - sleep 5 - kubectl -n kube-system exec "$POD" -- ls /tmp/ - kubectl -n kube-system logs "$POD" - kubectl -n kube-system exec "$POD" -- /bin/bash -c "cat /tmp/fluence-graph-*.json" - kubectl get nodes -o jsonpath='{range .items[*]}{.metadata.name}{": cpu="}{.status.allocatable.cpu}{" mem="}{.status.allocatable.memory}{"\n"}{end}' - - - name: E2E - classical gang - run: bash test/e2e/01-classical-gang.sh - - - name: Deploy quantum add-on - run: | - # Includes the device plugin and oriented to testing container - kubectl apply -f deploy/fluence-resources-test.yaml - kubectl rollout restart -n kube-system deployment/fluence - kubectl rollout status -n kube-system deployment/fluence --timeout=60s - for i in $(seq 1 60); do - kubectl get nodes -o jsonpath='{range .items[*]}{.status.allocatable}{"\n"}{end}' - kubectl get nodes -o jsonpath='{range .items[*]}{.status.allocatable}{"\n"}{end}' | grep -q 'fluxion.flux-framework.org/qpu' && break - sleep 1 - done - # After a rollout restart BOTH the old and new pods are briefly Running. - # Select only a Ready pod with no deletionTimestamp (i.e. the new one, - # not the terminating old one) so exec/logs don't 404. 
- POD="" - for i in $(seq 1 60); do - POD=$(kubectl -n kube-system get pods -l app=fluence \ - -o go-template='{{range .items}}{{if not .metadata.deletionTimestamp}}{{$name := .metadata.name}}{{range .status.conditions}}{{if and (eq .type "Ready") (eq .status "True")}}{{$name}}{{"\n"}}{{end}}{{end}}{{end}}{{end}}' 2>/dev/null | head -1 || true) - [ -n "$POD" ] && break - sleep 2 - done - [ -n "$POD" ] || { echo "ERROR: no Ready non-terminating fluence pod found after restart"; kubectl -n kube-system get pods -l app=fluence -o wide; exit 1; } - echo "Using pod: $POD" - # Brief sleep to let the container runtime stabilize before exec - sleep 5 - kubectl -n kube-system exec "$POD" -- /bin/bash -c "cat /tmp/fluence-graph-*.json" - - - name: Wait for webhook - run: | - - # wait for the deployment AND for the caBundle to be populated on the webhook config - kubectl -n kube-system rollout status deployment/fluence-webhook --timeout=120s - for i in $(seq 1 30); do - cab=$(kubectl get mutatingwebhookconfiguration fluence-webhook \ - -o jsonpath='{.webhooks[0].clientConfig.caBundle}' 2>/dev/null) - [ -n "$cab" ] && break - sleep 2 - done - # let TLS serving settle after caBundle patch - sleep 3 - - - name: E2E - quantum placement - run: bash test/e2e/02-quantum-placement.sh - - #- name: E2E - restart recovery (no double-book) - # run: bash test/e2e/03-restart-recovery.sh - - - name: E2E - sidecar ungate - run: bash test/e2e/04-sidecar-ungate.sh - - - name: Dump diagnostics on failure - if: failure() - run: | - kubectl get pods -A -o wide - kubectl logs -n kube-system deployment/fluence + strategy: + fail-fast: false # one suite failing should not cancel the other + matrix: + suite: [gang, quantum] + uses: ./.github/workflows/e2e-suite.yaml + with: + suite: ${{ matrix.suite }} \ No newline at end of file diff --git a/Makefile b/Makefile index 1160cb4..5912c5a 100644 --- a/Makefile +++ b/Makefile @@ -55,13 +55,16 @@ test-image-deploy: test-image kubectl patch podgroup training 
-n default --type=merge -p '{"metadata":{"finalizers":null}}' || true kubectl delete deployments --all kubectl delete pods --all - kubectl delete -f deploy/fluence-test.yaml + kubectl delete -f deploy/fluence-test.yaml || true kubectl delete pods --all +.PHONY: test-deploy-recreate +test-deploy-recreate: test-image-deploy + kubectl apply -f deploy/fluence-pull-test.yaml .PHONY: deploy deploy: ## Install RBAC + scheduler into kube-system kubectl apply -f deploy/fluence.yaml .PHONY: help help: diff --git a/cmd/webhook/main.go b/cmd/webhook/main.go index ea2669a..1a6709d 100644 --- a/cmd/webhook/main.go +++ b/cmd/webhook/main.go @@ -12,9 +12,11 @@ package main import ( "context" "crypto/tls" + "flag" "log" "net/http" "os" + "strings" "time" "github.com/converged-computing/fluence/pkg/cluster" @@ -38,6 +40,29 @@ func main() { cfgName := env("WEBHOOK_CONFIG", "fluence-webhook") addr := env("WEBHOOK_ADDR", ":8443") + // Handler selection. By default ALL registered handlers are enabled. The + // operator may restrict the active set with --handlers (comma-separated) or + // the FLUENCE_HANDLERS env var, e.g. --handlers=fluxion,gang to run without + // quantum. An empty value means all enabled. Unknown names are warned about + // but not fatal (so config survives a handler being renamed/removed). + handlersFlag := flag.String("handlers", env("FLUENCE_HANDLERS", ""), + "comma-separated handlers in dispatch order (default: fluxion,quantum,gang). e.g. 
fluxion,gang disables quantum") + flag.Parse() + + var requested []string + if *handlersFlag != "" { + for _, n := range strings.Split(*handlersFlag, ",") { + if n = strings.TrimSpace(n); n != "" { + requested = append(requested, n) + } + } + } + active, unknown := webhook.SetActiveHandlers(requested) + for _, n := range unknown { + log.Printf("[fluence-webhook] WARNING: unknown handler %q — ignoring", n) + } + log.Printf("[fluence-webhook] active handlers (in dispatch order): %v", active) + dnsNames := []string{ svc + "." + ns + ".svc", svc + "." + ns + ".svc.cluster.local", @@ -87,7 +112,6 @@ func main() { mutator := &webhook.Mutator{ AttributeKeys: attrKeys, Clientset: client, - SidecarImage: env("FLUENCE_SIDECAR_IMAGE", ""), } log.Printf("[fluence-webhook] env contract injected into fluxion pods: %v", mutator.EnvVarNames()) diff --git a/deploy/fluence-pull-test.yaml b/deploy/fluence-pull-test.yaml new file mode 100644 index 0000000..94c2425 --- /dev/null +++ b/deploy/fluence-pull-test.yaml @@ -0,0 +1,286 @@ +apiVersion: v1 +kind: ServiceAccount +metadata: + name: fluence + namespace: kube-system +--- +# Bind the built-in scheduler roles so fluence (a full kube-scheduler build) has +# every list/watch the scheduling framework needs (nodes, pods, PV/PVC, CSI, +# storageclasses, resourceclaims/slices, volumeattachments, events, etc.). 
+apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: fluence-as-kube-scheduler +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: system:kube-scheduler +subjects: + - kind: ServiceAccount + name: fluence + namespace: kube-system +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: fluence-as-volume-scheduler +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: system:volume-scheduler +subjects: + - kind: ServiceAccount + name: fluence + namespace: kube-system +--- +# Delegated authentication: read the auth configmap in kube-system. This is the +# fix for the "extension-apiserver-authentication ... forbidden" errors. +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: fluence-extension-apiserver-authentication-reader + namespace: kube-system +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: extension-apiserver-authentication-reader +subjects: + - kind: ServiceAccount + name: fluence + namespace: kube-system +--- +# Extras the built-in scheduler role does not grant: the alpha PodGroup/Workload +# API (gang), and leader-election leases under our scheduler name. +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: fluence-extra +rules: + - apiGroups: ["scheduling.k8s.io"] + resources: ["podgroups", "workloads", "podgroups/status", "workloads/status"] + verbs: ["get", "list", "watch", "create", "update", "patch", "delete"] + - apiGroups: ["coordination.k8s.io"] + resources: ["leases"] + verbs: ["create", "get", "update", "list", "watch"] + # PreBind stamps the allocated backend onto the pod as an annotation; the + # built-in system:kube-scheduler role only allows patching pods/status, not + # the pod object, so grant it here. 
+ - apiGroups: [""] + resources: ["pods"] + # create/delete: the webhook creates the one-off quantum submitter pod + # (ensureSubmitterPod) and the scheduler reaps it during gang cleanup. + verbs: ["get", "list", "watch", "create", "patch", "update", "delete"] + # The webhook self-manages its TLS by patching its own config's caBundle. + - apiGroups: ["admissionregistration.k8s.io"] + resources: ["mutatingwebhookconfigurations"] + verbs: ["get", "list", "watch", "patch"] + # The webhook creates per-namespace sidecar RBAC on demand when a leader + # pod is admitted, so users do not need to apply RBAC manually. + - apiGroups: [""] + resources: ["serviceaccounts"] + verbs: ["get", "create"] + - apiGroups: [""] + resources: ["configmaps"] + verbs: ["get", "create"] + - apiGroups: ["rbac.authorization.k8s.io"] + resources: ["roles", "rolebindings"] + verbs: ["get", "create"] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: fluence-extra +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: fluence-extra +subjects: + - kind: ServiceAccount + name: fluence + namespace: kube-system +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: fluence-scheduler-config + namespace: kube-system +data: + scheduler-config.yaml: | + apiVersion: kubescheduler.config.k8s.io/v1 + kind: KubeSchedulerConfiguration + leaderElection: + leaderElect: false + profiles: + - schedulerName: fluence + plugins: + # multiPoint wires Fluence into every extension point its Go type + # implements: PreFilter, Filter, and PreBind (which stamps the backend + # annotation). Listing points individually risks omitting one — that is + # exactly what left PreBind unwired and the backend annotation unset. 
+ multiPoint: + enabled: [{name: Fluence}] +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: fluence + namespace: kube-system + labels: {app: fluence} +spec: + replicas: 1 + selector: + matchLabels: {app: fluence} + template: + metadata: + labels: {app: fluence} + spec: + serviceAccountName: fluence + containers: + - name: fluence + image: vanessa/fluence:test + # Allows for kind load + imagePullPolicy: Always + command: + - /bin/fluence + - --config=/etc/fluence/scheduler-config.yaml + # fluence is its own scheduler binary, so it needs the gang gates set + # here (the cluster-level kube-scheduler gates don't apply to it). + # Without these its PodGroup/GangScheduling plugin is inactive, pods + # schedule with no gang semantics, and PodGroup status stays Pending. + - --feature-gates=GenericWorkload=true,GangScheduling=true + # Re-attempt unschedulable pods more often than the 5m default. In the + # contention experiment a gang that loses the initial race for nodes is + # marked Unschedulable; this is how soon it is re-tried after capacity + # frees (the event-driven QueueingHint is best-effort; this is the + # backstop that bounds worst-case requeue latency). 30s keeps contended + # gangs draining promptly without thrashing the queue. + - --pod-max-in-unschedulable-pods-duration=30s + - --v=4 + env: + # Path to the resources config (e.g. quantum backends). Unset/empty + # file -> classical-only graph. Supplied by the quantum add-on. + - name: FLUENCE_RESOURCES + value: /etc/fluence/resources.yaml + volumeMounts: + - name: config + mountPath: /etc/fluence + volumes: + - name: config + projected: + sources: + - configMap: {name: fluence-scheduler-config} + - configMap: {name: fluence-resources, optional: true} +--- +# Mutating webhook: injects scheduler-chosen values into pods at creation time +# (currently a downward-API QRMI_BACKEND env for quantum pods). 
It self-manages +# TLS — generates a CA + serving cert at startup and patches the caBundle below — +# so no cert-manager and no committed keys. failurePolicy Ignore keeps a webhook +# outage from blocking pod creation cluster-wide. +apiVersion: apps/v1 +kind: Deployment +metadata: + name: fluence-webhook + namespace: kube-system + labels: {app: fluence-webhook} +spec: + replicas: 1 + selector: + matchLabels: {app: fluence-webhook} + template: + metadata: + labels: {app: fluence-webhook} + spec: + serviceAccountName: fluence + containers: + - name: webhook + image: vanessa/fluence:test + # Allows for kind load + imagePullPolicy: Always + command: ["/bin/fluence-webhook"] + env: + # Use busybox as sidecar image in tests — avoids pulling the real + # sidecar image which is large and not cached in CI. + - name: FLUENCE_SIDECAR_IMAGE + value: "busybox:latest" + ports: + - containerPort: 8443 + readinessProbe: + httpGet: {path: /healthz, port: 8443, scheme: HTTPS} + initialDelaySeconds: 2 +--- +apiVersion: v1 +kind: Service +metadata: + name: fluence-webhook + namespace: kube-system +spec: + selector: {app: fluence-webhook} + ports: + - port: 443 + targetPort: 8443 +--- +apiVersion: admissionregistration.k8s.io/v1 +kind: MutatingWebhookConfiguration +metadata: + name: fluence-webhook +webhooks: + - name: pods.fluence.flux-framework.org + admissionReviewVersions: ["v1"] + sideEffects: None + failurePolicy: Ignore # never block pod creation if the webhook is down + # caBundle is filled in at runtime by the webhook patching this object. + clientConfig: + service: + name: fluence-webhook + namespace: kube-system + path: /mutate + port: 443 + rules: + - apiGroups: [""] + apiVersions: ["v1"] + operations: ["CREATE"] + resources: ["pods"] + scope: Namespaced + # Don't intercept system pods (and avoid bootstrap coupling). 
+ namespaceSelector: + matchExpressions: + - key: kubernetes.io/metadata.name + operator: NotIn + values: ["kube-system"] +--- +# fluence-sidecar.yaml +# +# RBAC and supporting resources for the Fluence quantum sidecar. +# +# The sidecar runs inside a leader pod and needs: +# - patch/annotate on pods in its own namespace (to ungate workers and +# propagate the task ARN annotation) +# +# The sidecar ServiceAccount is namespace-scoped — it only has permissions +# in the namespace where the workflow runs. The webhook sets +# spec.serviceAccountName on the leader pod to fluence-sidecar. +# +# The fluence Python package is staged into user containers by an init +# container (Model C): the webhook injects an init container from the +# sidecar image that copies the package + sitecustomize into a shared +# volume on the user container's PYTHONPATH. No ConfigMap, no user install. +# +# Apply with: +# kubectl apply -f deploy/fluence-sidecar.yaml + + +--- +# PriorityClass for classical pods paired with quantum work. +# Applied to worker pods by the webhook when they are gated. +# When ungated, high priority triggers preemption of lower-priority work +# so workers get nodes immediately as the QPU result arrives. +apiVersion: scheduling.k8s.io/v1 +kind: PriorityClass +metadata: + name: fluence-quantum-classical + labels: + app: fluence +value: 1000000 +globalDefault: false +preemptionPolicy: PreemptLowerPriority +description: "High priority for classical pods paired with quantum work. Set by Fluence webhook." diff --git a/deploy/fluence-test.yaml b/deploy/fluence-test.yaml index 6d1dace..ab61a91 100644 --- a/deploy/fluence-test.yaml +++ b/deploy/fluence-test.yaml @@ -67,7 +67,9 @@ rules: # the pod object, so grant it here. - apiGroups: [""] resources: ["pods"] - verbs: ["get", "list", "watch", "patch", "update"] + # create/delete: the webhook creates the one-off quantum submitter pod + # (ensureSubmitterPod) and the scheduler reaps it during gang cleanup. 
+ verbs: ["get", "list", "watch", "create", "patch", "update", "delete"] # The webhook self-manages its TLS by patching its own config's caBundle. - apiGroups: ["admissionregistration.k8s.io"] resources: ["mutatingwebhookconfigurations"] @@ -146,6 +148,13 @@ spec: # Without these its PodGroup/GangScheduling plugin is inactive, pods # schedule with no gang semantics, and PodGroup status stays Pending. - --feature-gates=GenericWorkload=true,GangScheduling=true + # Re-attempt unschedulable pods more often than the 5m default. In the + # contention experiment a gang that loses the initial race for nodes is + # marked Unschedulable; this is how soon it is re-tried after capacity + # frees (the event-driven QueueingHint is best-effort; this is the + # backstop that bounds worst-case requeue latency). 30s keeps contended + # gangs draining promptly without thrashing the queue. + - --pod-max-in-unschedulable-pods-duration=30s - --v=4 env: # Path to the resources config (e.g. quantum backends). Unset/empty diff --git a/deploy/fluence.yaml b/deploy/fluence.yaml index b856268..7d71386 100644 --- a/deploy/fluence.yaml +++ b/deploy/fluence.yaml @@ -67,7 +67,9 @@ rules: # the pod object, so grant it here. - apiGroups: [""] resources: ["pods"] - verbs: ["get", "list", "watch", "patch", "update"] + # create/delete: the webhook creates the one-off quantum submitter pod + # (ensureSubmitterPod) and the scheduler reaps it during gang cleanup. + verbs: ["get", "list", "watch", "create", "patch", "update", "delete"] # The webhook self-manages its TLS by patching its own config's caBundle. 
- apiGroups: ["admissionregistration.k8s.io"] resources: ["mutatingwebhookconfigurations"] diff --git a/deploy/kind-config.yaml b/deploy/kind-config.yaml index c94e070..ec310bc 100644 --- a/deploy/kind-config.yaml +++ b/deploy/kind-config.yaml @@ -32,4 +32,4 @@ nodes: - name: feature-gates value: "GenericWorkload=true" - role: worker - - role: worker + - role: worker \ No newline at end of file diff --git a/docs/handlers.md b/docs/handlers.md new file mode 100644 index 0000000..1da169a --- /dev/null +++ b/docs/handlers.md @@ -0,0 +1,83 @@ +# Webhook handlers & sidecar architecture + +Fluence's value is not creating gangs (Kubernetes 1.36 native gang scheduling +already does that). It is **customizing the gang on the fly based on the +resources a pod requests** — e.g. a quantum leader/worker workload becomes a +size-1 leader gang plus a size-(N-1) worker gang, with the leader running a +sidecar that ungates its workers when the quantum task is ready. + +## Handlers + +Each handler is an interface implementation (`pkg/webhook/handler.go`): + +```go +type Handler interface { + Name() string + Applies(ctx, m MutatorAPI, pod) bool + Mutate(ctx, m MutatorAPI, pod) []spec.Op +} +``` + +Handlers self-register by name (`init()` -> `webhook.Register`); a blank import +of the handlers package makes them AVAILABLE. The core never names a handler. + +**Ordering = the active list.** There is no per-handler priority. The active +handler list is BOTH the selection and the dispatch order: + +```go +var DefaultHandlerOrder = []string{"fluxion", "quantum", "gang"} +``` + +Dispatch walks this list in order. `gang` is last because it is last in the +list — the fallback that applies common defaults (honor `group-size`, else +owner-derived N) only if no earlier handler already shaped the gang. A +custom-resource handler is inserted into the list before `gang` to shape its own +gang first. To change the order, or disable a handler, pass a different list. 
+ +## Enabling/disabling handlers + +By default ALL registered handlers are enabled. Restrict the active set on the +webhook command: + +``` +fluence-webhook --handlers=fluxion,gang # run without quantum +FLUENCE_HANDLERS=fluxion,quantum,gang fluence-webhook +``` + +Empty = the default list. The list is the order: `--handlers=gang,fluxion` runs +gang first; omitting a name disables it. Unknown names are warned and dropped. + +(The handler set lives in the WEBHOOK, which mutates pods. `cmd/fluence` is the +scheduler plugin and runs no handlers.) + +## Sidecar interface + +The coordination sidecar is a handler-owned capability, not a core one. Handlers +that need a sidecar use `handlers.Sidecar`: + +```go +type Sidecar interface { + EnsureRBAC(ctx, namespace) + InterceptorOps(pod) []spec.Op + ContainerOps(pod, observe bool) []spec.Op +} +``` + +The default `coreSidecar` delegates to the core's staging primitives. The quantum +handler uses it today; a custom handler can supply its own implementation +(different image, env, gating) without touching the core or other handlers. The +core's `MutatorAPI` keeps the staging primitives only so the default +implementation can delegate — handlers do not call them directly. + +## Group size resolution (the default gang handler) + +`minCount` (the atomic-schedule count) resolves as: + +1. explicit `fluence.flux-framework.org/group-size` annotation — honored verbatim + (the override; e.g. a quantum split sets it directly); +2. else the owning indexed Job's `parallelism` (== MiniCluster size N); +3. else 1, logged. + +This is a common default available to every gang; handler-specific annotations +(quantum role, expected-workers, etc.) live in their handlers and are not +required by the core. 
diff --git a/examples/quantum-pod.yaml b/examples/quantum-pod.yaml index a619df9..b5dfbc9 100644 --- a/examples/quantum-pod.yaml +++ b/examples/quantum-pod.yaml @@ -2,7 +2,7 @@ # via resources (the fluence device plugin advertises fluxion.flux-framework.org/qpu # on every node, so NodeResourcesFit is satisfied). Fluence's PreFilter matches # the request against the resource graph and picks a backend, the webhook injects -# QRMI_BACKEND (the allocated backend) automatically, and note we can add other +# FLUXION_BACKEND (the allocated backend) automatically, and note we can add other # envars here in the future. I chose a webhook because I think this is going to # be a requirement, and the pod is immutable after creation. # Then the container submits via qrmi-go (the separate qrmi-sampler image). @@ -27,4 +27,4 @@ spec: requests: fluxion.flux-framework.org/qpu: "1" limits: - fluxion.flux-framework.org/qpu: "1" \ No newline at end of file + fluxion.flux-framework.org/qpu: "1" diff --git a/examples/test/e2e/gang/multi-gang-contention.yaml b/examples/test/e2e/gang/multi-gang-contention.yaml new file mode 100644 index 0000000..14b0fd8 --- /dev/null +++ b/examples/test/e2e/gang/multi-gang-contention.yaml @@ -0,0 +1,40 @@ +# Two gangs that cannot both place: fluxion allocates one core per slot, so two +# 2-pod gangs need 4 cores, but the cluster graphs ~3 (3 workers, ~1 core each). One gang places entirely; the loser stays FULLY pending +# (all-or-nothing), never partial. 
+apiVersion: apps/v1 +kind: Deployment +metadata: + name: gang-a +spec: + replicas: 2 + selector: {matchLabels: {app: gang-a}} + template: + metadata: + labels: {app: gang-a, fluence.flux-framework.org/group: gang-a} + annotations: {fluence.flux-framework.org/group-size: "2"} + spec: + schedulerName: fluence + containers: + - name: w + image: busybox + command: ["sleep", "3600"] + resources: {requests: {cpu: "1"}} +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: gang-b +spec: + replicas: 2 + selector: {matchLabels: {app: gang-b}} + template: + metadata: + labels: {app: gang-b, fluence.flux-framework.org/group: gang-b} + annotations: {fluence.flux-framework.org/group-size: "2"} + spec: + schedulerName: fluence + containers: + - name: w + image: busybox + command: ["sleep", "3600"] + resources: {requests: {cpu: "1"}} diff --git a/examples/test/e2e/gang/multi-gang-requeue.yaml b/examples/test/e2e/gang/multi-gang-requeue.yaml new file mode 100644 index 0000000..a8e8636 --- /dev/null +++ b/examples/test/e2e/gang/multi-gang-requeue.yaml @@ -0,0 +1,48 @@ +# Requeue-on-capacity + gang-atomicity test (test/e2e/gang/09). +# gang-win: a 2-pod gang that runs a SHORT job and COMPLETES (pods -> Succeeded), +# freeing its nodes. +# gang-wait: a 2-pod gang needing the same nodes; loses the initial race and sits +# Unschedulable. When gang-win completes, gang-wait must be re-attempted +# (via the shortened unschedulable-recheck timeout) and place atomically. +# On a 3-worker (~3-core) cluster the two 2-pod gangs (4 cores) cannot co-run. 
+apiVersion: batch/v1 +kind: Job +metadata: + name: gang-win +spec: + completions: 2 + parallelism: 2 + completionMode: Indexed + template: + metadata: + labels: {fluence.flux-framework.org/group: gang-win} + annotations: {fluence.flux-framework.org/group-size: "2"} + spec: + schedulerName: fluence + restartPolicy: Never + containers: + - name: w + image: busybox + command: ["sh","-c","sleep 30"] # completes, frees nodes + resources: {requests: {cpu: "1"}} +--- +apiVersion: batch/v1 +kind: Job +metadata: + name: gang-wait +spec: + completions: 2 + parallelism: 2 + completionMode: Indexed + template: + metadata: + labels: {fluence.flux-framework.org/group: gang-wait} + annotations: {fluence.flux-framework.org/group-size: "2"} + spec: + schedulerName: fluence + restartPolicy: Never + containers: + - name: w + image: busybox + command: ["sh","-c","sleep 10"] + resources: {requests: {cpu: "1"}} \ No newline at end of file diff --git a/examples/test/e2e/gang/multi-gang.yaml b/examples/test/e2e/gang/multi-gang.yaml new file mode 100644 index 0000000..9bfa67c --- /dev/null +++ b/examples/test/e2e/gang/multi-gang.yaml @@ -0,0 +1,25 @@ +# Multi-pod gang via the WEBHOOK path (the path the experiments use). +apiVersion: apps/v1 +kind: Deployment +metadata: + name: gang3 +spec: + replicas: 2 + selector: + matchLabels: {app: gang3} + template: + metadata: + labels: + app: gang3 + fluence.flux-framework.org/group: gang3 + annotations: + fluence.flux-framework.org/group-size: "2" + spec: + schedulerName: fluence + containers: + - name: worker + image: busybox + command: ["sleep", "3600"] + resources: + requests: + cpu: "1" diff --git a/examples/single-podgroup.yaml b/examples/test/e2e/gang/single-podgroup.yaml similarity index 100% rename from examples/single-podgroup.yaml rename to examples/test/e2e/gang/single-podgroup.yaml diff --git a/examples/test/e2e/quantum/quantum-gang-pods.yaml b/examples/test/e2e/quantum/quantum-gang-pods.yaml new file mode 100644 index 0000000..b345398 
--- /dev/null +++ b/examples/test/e2e/quantum/quantum-gang-pods.yaml @@ -0,0 +1,49 @@ +# Gang + submitter quantum workload for the e2e (no leader/worker). +# +# Two pods, identical, both requesting the quantum resource, in group "qgang". +# The user authors NO roles and NO submitter — the webhook treats this as a gang +# of full size N=2 (group-size makes N deterministic for raw pods, which have no +# owning Job/Deployment to derive it from), gates every pod, and ADDITIONALLY +# creates the one-off submitter pod "qgang-submitter" (its own group-of-one) that +# runs the real submit and ungates the gang. busybox stands in for the quantum +# app; the interceptor staging fails soft (no python), which is fine for the +# structural assertions in 02/03/04. +apiVersion: v1 +kind: Pod +metadata: + name: qgang-0 + labels: + app: qgang + fluence.flux-framework.org/group: qgang + annotations: + fluence.flux-framework.org/group-size: "2" +spec: + schedulerName: fluence + restartPolicy: Never + containers: + - name: app + image: busybox + command: ["sh", "-c", "echo gang member; sleep 600"] + resources: + requests: {fluxion.flux-framework.org/qpu: "1"} + limits: {fluxion.flux-framework.org/qpu: "1"} +--- +apiVersion: v1 +kind: Pod +metadata: + name: qgang-1 + labels: + app: qgang + fluence.flux-framework.org/group: qgang + annotations: + fluence.flux-framework.org/group-size: "2" +spec: + schedulerName: fluence + restartPolicy: Never + containers: + - name: app + image: busybox + command: ["sh", "-c", "echo gang member; sleep 600"] + resources: + requests: {fluxion.flux-framework.org/qpu: "1"} + limits: {fluxion.flux-framework.org/qpu: "1"} \ No newline at end of file diff --git a/examples/test/e2e/quantum-pod-mock.yaml b/examples/test/e2e/quantum/quantum-pod-mock.yaml similarity index 100% rename from examples/test/e2e/quantum-pod-mock.yaml rename to examples/test/e2e/quantum/quantum-pod-mock.yaml diff --git a/examples/test/e2e/sidecar-mock-pods.yaml 
b/examples/test/e2e/sidecar-mock-pods.yaml deleted file mode 100644 index fb223a7..0000000 --- a/examples/test/e2e/sidecar-mock-pods.yaml +++ /dev/null @@ -1,64 +0,0 @@ ---- -# Leader pod — first admitted, webhook creates PodGroup, injects sidecar, creates RBAC -# User only needs schedulerName: fluence and the quantum-group label. -# No PodGroup object needed — Fluence creates it. -apiVersion: v1 -kind: Pod -metadata: - name: sidecar-test-leader - labels: - app: fluence-sidecar-test - fluence.flux-framework.org/group: sidecar-test-group -spec: - schedulerName: fluence - restartPolicy: Never - containers: - - name: mock-quantum-app - image: busybox - command: - - sh - - -c - - | - echo "mock-quantum-app: running" - echo "arn:aws:braket:us-east-1:123456:quantum-task/mock-abc123" \ - > /tmp/task-arn - echo "mock-quantum-app: task ARN written" - sleep 3600 - resources: - requests: - fluxion.flux-framework.org/qpu: "1" - limits: - fluxion.flux-framework.org/qpu: "1" - ---- -# Worker pod — classical (no QPU). Gated by the webhook because it is a -# non-leader member of a group whose leader is a quantum pod. -apiVersion: v1 -kind: Pod -metadata: - name: sidecar-test-worker - labels: - app: fluence-sidecar-test - fluence.flux-framework.org/group: sidecar-test-group -spec: - schedulerName: fluence - restartPolicy: Never - containers: - - name: classical-worker - image: busybox - command: - - sh - - -c - - | - echo "classical-worker: started" - echo "TASK_ARN=$BRAKET_TASK_ARN" - sleep 10 - env: - - name: FLUENCE_QUANTUM_JOB_ID - valueFrom: - fieldRef: - fieldPath: metadata.annotations['fluence.flux-framework.org/quantum-job-id'] - resources: - requests: - cpu: "100m" - memory: "128Mi" diff --git a/pkg/fluence/fluence.go b/pkg/fluence/fluence.go index a1a10e1..fd3b080 100644 --- a/pkg/fluence/fluence.go +++ b/pkg/fluence/fluence.go @@ -77,14 +77,61 @@ type Fluence struct { mu sync.Mutex // placement maps a group key to its allocation (nodes, backend, jobids). 
placement map[string]groupAlloc + // excludedNodes maps a group key to the set of nodes that are GENUINELY + // INCOMPATIBLE with that group (PostFilter saw UnschedulableAndUnresolvable + // from another plugin: a taint, affinity, or constraint Fluxion's graph does + // not model). PreFilter feeds them back as an RFC 31 negated-hostlist + // constraint so the re-match is steered onto other nodes. Nodes that were + // merely BUSY are deliberately NOT recorded here (excluding them would turn + // transient contention into permanent group failure). The set only grows for a + // group, so the exclusion-driven re-match is finite, and it is cleared on + // teardown. Guarded by mu. + excludedNodes map[string]map[string]bool } var ( - _ fwk.PreFilterPlugin = (*Fluence)(nil) - _ fwk.FilterPlugin = (*Fluence)(nil) - _ fwk.PreBindPlugin = (*Fluence)(nil) + _ fwk.PreFilterPlugin = (*Fluence)(nil) + _ fwk.FilterPlugin = (*Fluence)(nil) + _ fwk.PostFilterPlugin = (*Fluence)(nil) + _ fwk.ReservePlugin = (*Fluence)(nil) + _ fwk.PreBindPlugin = (*Fluence)(nil) ) +// schedulableNodes returns only the nodes a normal pod could actually be placed +// on, so the Fluxion graph never offers a node that Kubernetes will then reject +// in Filter. Two kinds are dropped: +// +// - cordoned nodes (spec.unschedulable), and +// - nodes carrying a NoSchedule/NoExecute taint (e.g. the control-plane's +// node-role.kubernetes.io/control-plane:NoSchedule). +// +// Without this, Fluxion can place a gang slot on the control-plane (it looks like +// a valid virtual=false compute node to the graph), the pod is then rejected by +// TaintToleration with UnschedulableAndUnresolvable, and PostFilter abandons the +// whole allocation — on a small cluster that strands the gang permanently. We do +// not attempt to honor specific tolerations here: gang workloads in this setup do +// not tolerate node taints, so any NoSchedule/NoExecute taint means "not for us". 
+func schedulableNodes(nodes []corev1.Node) []corev1.Node { + out := make([]corev1.Node, 0, len(nodes)) + for _, n := range nodes { + if n.Spec.Unschedulable { + continue + } + tainted := false + for _, t := range n.Spec.Taints { + if t.Effect == corev1.TaintEffectNoSchedule || t.Effect == corev1.TaintEffectNoExecute { + tainted = true + break + } + } + if tainted { + continue + } + out = append(out, n) + } + return out +} + // New builds the plugin: discover cluster nodes, optionally inject quantum // resources, write the JGF graph, initialize the Fluxion matcher, and register // the delete handlers that cancel allocations when their owning object is gone. @@ -129,7 +176,7 @@ func New(ctx context.Context, _ runtime.Object, h fwk.Handle) (fwk.Plugin, error } } - jgfBytes, err := cluster.BuildGraph(nodeList.Items, opts) + jgfBytes, err := cluster.BuildGraph(schedulableNodes(nodeList.Items), opts) if err != nil { return nil, fmt.Errorf("build resource graph: %w", err) } @@ -161,10 +208,11 @@ func New(ctx context.Context, _ runtime.Object, h fwk.Handle) (fwk.Plugin, error fluxion.Init(tmp.Name(), os.Getenv("FLUENCE_MATCH_POLICY"), "") f := &Fluence{ - handle: h, - matcher: fluxion, - knownDevices: knownDevices, - placement: map[string]groupAlloc{}, + handle: h, + matcher: fluxion, + knownDevices: knownDevices, + placement: map[string]groupAlloc{}, + excludedNodes: map[string]map[string]bool{}, } f.registerCancelHandlers() // Periodic + startup reconcile of completed Fluence-created PodGroups, so a @@ -251,7 +299,15 @@ func (f *Fluence) PreFilter( return nil, fwk.AsStatus(err) } - specs, err := placement.JobspecsForGroup(group, pods, f.knownDevices) + f.mu.Lock() + excluded := make([]string, 0, len(f.excludedNodes[group])) + for n := range f.excludedNodes[group] { + excluded = append(excluded, n) + } + f.mu.Unlock() + sort.Strings(excluded) // deterministic constraint for stable matching/logs + + specs, err := placement.JobspecsForGroup(group, pods, f.knownDevices, 
excluded) if err != nil { return nil, fwk.AsStatus(err) } @@ -390,6 +446,103 @@ func (f *Fluence) Filter( return fwk.NewStatus(fwk.Unschedulable, "node not in fluxion allocation for this group") } +// PostFilter runs when a pod could not be scheduled after Filter — for a Fluence +// group, this means the cached Fluxion allocation's nodes did not all survive the +// other scheduler plugins' Filter checks. Without intervention the group would +// retry forever against the same cached allocation while the Fluxion reservation +// leaked, because PreFilter short-circuits on the cache and nothing else releases +// it on a scheduling failure. +// +// We always abandon the failed allocation here (cancel the Fluxion jobids, drop +// the cached placement) so the next PreFilter re-matches fresh. The careful part +// is WHICH nodes we then permanently exclude from the group's future matches, +// because a group reaches PostFilter for two very different reasons and they must +// be handled oppositely (see fwk.Code docs): +// +// - UnschedulableAndUnresolvable: the node genuinely cannot host this pod and +// re-trying it is pointless (a taint the pod does not tolerate, node affinity +// mismatch, a constraint Fluxion's graph does not model). EXCLUDE it; the +// next PreFilter feeds the exclusion set back as an RFC 31 negated-hostlist +// constraint so Fluxion is steered onto other nodes. +// +// - Unschedulable (plain): the node could host the pod, just not at this +// instant (it is momentarily full). This is TRANSIENT. Do NOT exclude it — +// excluding a merely-busy node converts ordinary contention into permanent +// group failure, and in a saturated cluster (a gang that needs the whole node +// set) it strands the gang forever even though it would fit once a node frees. 
+// +// So contention excludes nothing and the group recovers by waiting/retrying; +// only durable incompatibility accumulates in excludedNodes (cleared on group +// teardown), which keeps the exclusion-driven re-match finite and correct. +func (f *Fluence) PostFilter( + ctx context.Context, + state fwk.CycleState, + pod *corev1.Pod, + filteredNodeStatusMap fwk.NodeToStatusReader, +) (*fwk.PostFilterResult, *fwk.Status) { + group := groupKey(pod) + + f.mu.Lock() + alloc, ok := f.placement[group] + if !ok { + // No cached allocation for this group — nothing of ours to reconcile. + // (Another plugin's PostFilter, or a non-group pod.) + f.mu.Unlock() + return nil, fwk.NewStatus(fwk.Unschedulable) + } + // Exclude ONLY nodes that are genuinely incompatible with this pod, never + // nodes that were merely busy this cycle. The framework gives us a per-node + // status: UnschedulableAndUnresolvable means the node cannot host the pod and + // re-trying it is pointless (a taint the pod does not tolerate, node affinity + // mismatch, a constraint Fluxion's graph does not model) -> exclude it so the + // re-match is steered elsewhere. A plain Unschedulable means the node could + // host the pod but not right now (it is momentarily full) -> do NOT exclude + // it; it must stay eligible so the group can land there once capacity frees. + // + // This is the whole point: a group enters PostFilter for many reasons, and + // "the cluster is just full at this instant" is the common one. Permanently + // banning the busy nodes (the old whole-allocation exclusion) turned transient + // contention into permanent group failure — exactly backwards. Now contention + // excludes nothing; the group simply abandons this cycle's reservation and + // retries the same nodes when they free. 
+ if f.excludedNodes[group] == nil { + f.excludedNodes[group] = map[string]bool{} + } + var incompatible, busy []string + for _, n := range alloc.place.Nodes { + var code fwk.Code + if filteredNodeStatusMap != nil { + if st := filteredNodeStatusMap.Get(n); st != nil { + code = st.Code() + } + } + if code == fwk.UnschedulableAndUnresolvable { + f.excludedNodes[group][n] = true + incompatible = append(incompatible, n) + } else { + // plain Unschedulable, Success, or unknown/nil -> transient, keep. + busy = append(busy, n) + } + } + excludedCount := len(f.excludedNodes[group]) + jobids := alloc.jobids + delete(f.placement, group) + f.mu.Unlock() + + // Release the Fluxion reservation for the abandoned allocation so the graph + // does not leak it while the group retries. + f.cancelJobids(jobids) + + log.Printf("[fluence] group %s unschedulable: abandoning allocation (jobids %v); "+ + "incompatible(excluded)=%v busy(retryable, NOT excluded)=%v; %d node(s) excluded total", + group, jobids, incompatible, busy, excludedCount) + + // Returning Unschedulable (no nominated node) lets the pod be requeued; the + // next PreFilter re-matches (with any incompatible nodes excluded, but busy + // nodes still in play). Fluxion, not PostFilter preemption, chooses placement. + return nil, fwk.NewStatus(fwk.Unschedulable) +} + // PreBindPreFlight runs before PreBind. It returns Success when we have a cached // allocation for the pod's group (so PreBind can record the jobid, and stamp the // backend for a quantum pod), and Skip otherwise. @@ -408,12 +561,59 @@ func (f *Fluence) PreBindPreFlight( return nil, fwk.NewStatus(fwk.Success) } +// Reserve stamps the chosen backend (and matched attributes) onto the pod as +// early as possible — at reservation, in the scheduling cycle — rather than in +// PreBind. 
The webhook injects FLUXION_BACKEND (and FLUXION_<ATTR>) as a +// downward-API env sourced from these annotations; downward-API env is resolved +// by the kubelet when the container starts and is NOT updated afterward, so the +// annotation must be present well before the container starts. PreBind runs in +// the (asynchronous) binding cycle, milliseconds before Bind, which races the +// kubelet — Reserve runs earlier and synchronously, giving the annotation time +// to propagate so the value reliably surfaces in the container. +func (f *Fluence) Reserve( + ctx context.Context, + state fwk.CycleState, + pod *corev1.Pod, + nodeName string, +) *fwk.Status { + if err := f.stampBackend(ctx, pod); err != nil { + return fwk.AsStatus(fmt.Errorf("stamp backend annotations: %w", err)) + } + return fwk.NewStatus(fwk.Success) +} + +// Unreserve is a no-op: a stale backend annotation from a reservation that was +// later rejected is harmless (it is overwritten on the next attempt and the +// value is correct for the allocation that produced it), and clearing it would +// cost an extra API call. Required to satisfy fwk.ReservePlugin. +func (f *Fluence) Unreserve(ctx context.Context, state fwk.CycleState, pod *corev1.Pod, nodeName string) { +} + +// stampBackend writes the allocated backend name and matched attributes onto the +// pod (idempotent merge patch). No-op when there is no cached allocation or the +// allocation carries no backend (classical, non-quantum gangs).
+func (f *Fluence) stampBackend(ctx context.Context, pod *corev1.Pod) error { + f.mu.Lock() + alloc, ok := f.placement[groupKey(pod)] + f.mu.Unlock() + if !ok || alloc.place.Backend == "" { + return nil + } + ann := map[string]string{placement.BackendAnnotation: alloc.place.Backend} + for k, v := range alloc.place.BackendAttributes { + ann[placement.AttributeAnnotationPrefix+k] = v + } + log.Printf("[fluence] group %s -> backend %q attrs %v (reserve-stamped, nodes %v)", + groupKey(pod), alloc.place.Backend, alloc.place.BackendAttributes, alloc.place.Nodes) + return f.patchPodAnnotations(ctx, pod.Namespace, pod.Name, ann) +} + // PreBind records, in the commit phase, the durable state for this group: -// - the Fluxion jobid onto the owning object (the PodGroup for a gang, else the -// pod) so the allocation can be cancelled when that object is deleted; -// - for a quantum group, the allocated backend onto the pod, which the webhook- -// injected downward-API env surfaces as QRMI_BACKEND (container env is -// immutable post-creation, so the value must travel via an annotation). +// the Fluxion jobid onto the owning object (the PodGroup for a gang, else the +// pod) so the allocation can be cancelled when that object is deleted. The +// backend annotation is stamped earlier, in Reserve (see stampBackend), because +// the webhook-injected downward-API env (FLUXION_BACKEND) must be present before +// the container starts; PreBind is too late and races the kubelet. func (f *Fluence) PreBind( ctx context.Context, state fwk.CycleState, @@ -430,20 +630,10 @@ func (f *Fluence) PreBind( if err := f.recordJobIDs(ctx, pod, alloc.jobids); err != nil { return fwk.AsStatus(fmt.Errorf("record jobids: %w", err)) } - if alloc.place.Backend != "" { - // Stamp the backend name and all matched attributes in one patch. The - // webhook injects a normalized env per annotation so the workload reads - // exactly what it matched (backend + region/qubits/...). 
- ann := map[string]string{placement.BackendAnnotation: alloc.place.Backend} - for k, v := range alloc.place.BackendAttributes { - ann[placement.AttributeAnnotationPrefix+k] = v - } - log.Printf("[fluence] group %s -> backend %q attrs %v (nodes %v, jobids %v)", - groupKey(pod), alloc.place.Backend, alloc.place.BackendAttributes, - alloc.place.Nodes, alloc.jobids) - if err := f.patchPodAnnotations(ctx, pod.Namespace, pod.Name, ann); err != nil { - return fwk.AsStatus(fmt.Errorf("stamp backend annotations: %w", err)) - } + // Backstop: if Reserve was skipped for any reason, ensure the backend is + // stamped before bind anyway (idempotent). + if err := f.stampBackend(ctx, pod); err != nil { + return fwk.AsStatus(fmt.Errorf("stamp backend annotations: %w", err)) } return fwk.NewStatus(fwk.Success) } @@ -637,6 +827,20 @@ func (f *Fluence) reconcileGroup(ctx context.Context, namespace, group string) { } log.Printf("fluence: reconciled completed gang %s/%s — deleted Fluence-created PodGroup, allocation freed", namespace, group) + + // Gang+submitter cleanup: the one-off quantum submitter pod and its + // group-of-one PodGroup (<group>-submitter) are not owned by the user's + // workload, so reap them alongside the gang. The submitter pod also carries + // an ownerReference to this gang PodGroup (so its deletion cascades via GC); + // this explicit delete is the backstop and also removes the submitter's own + // PodGroup. Skip when this group is itself a submitter group, to avoid + // recursing on <group>-submitter-submitter.
+ if !strings.HasSuffix(group, submitterGroupSuffix) { + sg := group + submitterGroupSuffix + _ = f.handle.ClientSet().SchedulingV1alpha2().PodGroups(namespace).Delete(ctx, sg, metav1.DeleteOptions{}) + _ = f.handle.ClientSet().CoreV1().Pods(namespace).Delete(ctx, sg, metav1.DeleteOptions{}) + log.Printf("fluence: reaped submitter %s/%s for gang %s", namespace, sg, group) + } } // reconcileGraceForEmpty is how long a Fluence-created PodGroup with no live @@ -648,6 +852,12 @@ const reconcileGraceForEmpty = 2 * time.Minute // package (the scheduler must not depend on the webhook). Kept in sync with it. const webhookGroupLabel = "fluence.flux-framework.org/group" +// submitterGroupSuffix mirrors handlers.SubmitterGroupSuffix: the one-off quantum +// submitter for gang <group> is named <group>-submitter (both the pod and its PodGroup). +// Duplicated here to avoid importing the webhook handlers package into the +// scheduler plugin; keep the two in sync. +const submitterGroupSuffix = "-submitter" + // onPodGroupDeleted frees the gang's allocation when its PodGroup is deleted.
func (f *Fluence) onPodGroupDeleted(obj interface{}) { pg, ok := obj.(*schedv1a2.PodGroup) @@ -718,6 +928,7 @@ func (f *Fluence) cancelGroup(key string, ann map[string]string) { f.mu.Lock() delete(f.placement, key) + delete(f.excludedNodes, key) // drop accumulated exclusions so a future group reusing the name starts clean f.mu.Unlock() } diff --git a/pkg/fluence/fluence_test.go b/pkg/fluence/fluence_test.go index 998e1a7..5228f97 100644 --- a/pkg/fluence/fluence_test.go +++ b/pkg/fluence/fluence_test.go @@ -1,6 +1,7 @@ package fluence import ( + "context" "errors" "testing" @@ -12,6 +13,7 @@ import ( schedv1a2 "k8s.io/api/scheduling/v1alpha2" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/client-go/tools/cache" + fwk "k8s.io/kube-scheduler/framework" ) // fakeMatcher records Cancel calls so cancel behavior can be asserted without @@ -46,7 +48,11 @@ func (m *fakeMatcher) Cancel(jobid uint64) error { } func newTestFluence(m matcher) *Fluence { - return &Fluence{matcher: m, placement: map[string]groupAlloc{}} + return &Fluence{ + matcher: m, + placement: map[string]groupAlloc{}, + excludedNodes: map[string]map[string]bool{}, + } } func ann(jobid string) map[string]string { @@ -345,3 +351,205 @@ func twoSpecs() []*jobspec.Jobspec { {Version: 9999}, } } + +// --- PostFilter allocation reconciliation ----------------------------------- + +// fakeNodeStatus is a minimal fwk.NodeToStatusReader for PostFilter tests: it +// maps node name -> status code so a test can mark some nodes incompatible +// (UnschedulableAndUnresolvable) and others merely busy (Unschedulable). 
+type fakeNodeStatus map[string]fwk.Code + +func (s fakeNodeStatus) Get(node string) *fwk.Status { + if c, ok := s[node]; ok { + return fwk.NewStatus(c) + } + return nil +} +func (s fakeNodeStatus) NodesForStatusCode(fwk.NodeInfoLister, fwk.Code) ([]fwk.NodeInfo, error) { + return nil, nil +} + +// PostFilter abandons the failed allocation (cancel jobids, drop cache) and +// excludes ONLY genuinely-incompatible nodes (UnschedulableAndUnresolvable). +// A node that was merely busy (plain Unschedulable) MUST stay eligible. +func TestPostFilterExcludesOnlyIncompatibleNodes(t *testing.T) { + m := &fakeMatcher{} + f := newTestFluence(m) + key := "default/training" + f.placement[key] = groupAlloc{ + place: placement.Placement{Nodes: []string{"node-a", "node-b", "node-c"}}, + jobids: []uint64{11, 12}, + } + pod := groupedPod("default", "training-0", "training", nil) + + // node-a incompatible (taint); node-b busy; node-c survived Filter. + status := fakeNodeStatus{ + "node-a": fwk.UnschedulableAndUnresolvable, + "node-b": fwk.Unschedulable, + "node-c": fwk.Success, + } + + _, st := f.PostFilter(context.Background(), nil, pod, status) + if st == nil || st.Code() != fwk.Unschedulable { + t.Fatalf("expected Unschedulable status, got %v", st) + } + if _, still := f.placement[key]; still { + t.Fatal("placement cache should be deleted after PostFilter") + } + if len(m.cancelled) != 2 { + t.Fatalf("expected both jobids cancelled, got %v", m.cancelled) + } + excl := f.excludedNodes[key] + if !excl["node-a"] { + t.Fatalf("incompatible node-a should be excluded, set=%v", excl) + } + if excl["node-b"] || excl["node-c"] { + t.Fatalf("busy/ok nodes must NOT be excluded (would strand a saturated gang), set=%v", excl) + } + if len(excl) != 1 { + t.Fatalf("expected exactly 1 excluded node, got %v", excl) + } +} + +// A group blocked purely by contention (every node merely busy) excludes NOTHING +// so it can retry the same nodes once they free — the saturated-cluster property. 
+func TestPostFilterContentionExcludesNothing(t *testing.T) { + m := &fakeMatcher{} + f := newTestFluence(m) + key := "default/training" + f.placement[key] = groupAlloc{ + place: placement.Placement{Nodes: []string{"node-a", "node-b"}}, + jobids: []uint64{1}, + } + pod := groupedPod("default", "training-0", "training", nil) + status := fakeNodeStatus{"node-a": fwk.Unschedulable, "node-b": fwk.Unschedulable} + + f.PostFilter(context.Background(), nil, pod, status) + + if len(f.excludedNodes[key]) != 0 { + t.Fatalf("a purely-busy group must exclude no nodes, got %v", f.excludedNodes[key]) + } + if _, still := f.placement[key]; still { + t.Fatal("placement cache should be deleted even when nothing is excluded") + } + if len(m.cancelled) != 1 { + t.Fatalf("expected the jobid cancelled, got %v", m.cancelled) + } +} + +// A nil status map (e.g. all nodes filtered out upstream) must be safe and +// exclude nothing rather than panic or ban the whole allocation. +func TestPostFilterNilStatusMapExcludesNothing(t *testing.T) { + m := &fakeMatcher{} + f := newTestFluence(m) + key := "default/training" + f.placement[key] = groupAlloc{place: placement.Placement{Nodes: []string{"node-a", "node-b"}}, jobids: []uint64{7}} + pod := groupedPod("default", "training-0", "training", nil) + + _, st := f.PostFilter(context.Background(), nil, pod, nil) + if st == nil || st.Code() != fwk.Unschedulable { + t.Fatalf("expected Unschedulable, got %v", st) + } + if len(f.excludedNodes[key]) != 0 { + t.Fatalf("nil status map must exclude nothing, got %v", f.excludedNodes[key]) + } +} + +// Incompatible nodes accumulate across attempts; busy ones never do. 
+func TestPostFilterAccumulatesIncompatibleAcrossAttempts(t *testing.T) { + m := &fakeMatcher{} + f := newTestFluence(m) + key := "default/training" + pod := groupedPod("default", "training-0", "training", nil) + + f.placement[key] = groupAlloc{place: placement.Placement{Nodes: []string{"node-a", "node-b"}}, jobids: []uint64{1}} + f.PostFilter(context.Background(), nil, pod, fakeNodeStatus{"node-a": fwk.UnschedulableAndUnresolvable, "node-b": fwk.Unschedulable}) + f.placement[key] = groupAlloc{place: placement.Placement{Nodes: []string{"node-c", "node-d"}}, jobids: []uint64{2}} + f.PostFilter(context.Background(), nil, pod, fakeNodeStatus{"node-c": fwk.UnschedulableAndUnresolvable, "node-d": fwk.Unschedulable}) + + excl := f.excludedNodes[key] + for _, n := range []string{"node-a", "node-c"} { + if !excl[n] { + t.Fatalf("incompatible %s should accumulate, got %v", n, excl) + } + } + if excl["node-b"] || excl["node-d"] { + t.Fatalf("busy nodes must never accumulate, got %v", excl) + } + if len(excl) != 2 { + t.Fatalf("exclusion set should be the 2 incompatible nodes, got %v", excl) + } +} + +// PostFilter on a group with no cached allocation (not ours, or already cleared) +// is a safe no-op: no panic, no cancel, returns Unschedulable. +func TestPostFilterUnknownGroupNoop(t *testing.T) { + m := &fakeMatcher{} + f := newTestFluence(m) + pod := groupedPod("default", "stranger-0", "stranger", nil) + + _, status := f.PostFilter(context.Background(), nil, pod, nil) + if status == nil || status.Code() != fwk.Unschedulable { + t.Fatalf("expected Unschedulable, got %v", status) + } + if len(m.cancelled) != 0 { + t.Fatalf("nothing should be cancelled for an unknown group, got %v", m.cancelled) + } + if len(f.excludedNodes) != 0 { + t.Fatalf("no exclusion set should be created for an unknown group, got %v", f.excludedNodes) + } +} + +// Teardown (cancelGroup) must clear the exclusion set so a future group reusing +// the same key does not inherit stale exclusions. 
+func TestCancelGroupClearsExclusions(t *testing.T) { + m := &fakeMatcher{} + f := newTestFluence(m) + key := "default/training" + f.placement[key] = groupAlloc{jobids: []uint64{9}} + f.excludedNodes[key] = map[string]bool{"node-a": true} + + f.cancelGroup(key, ann("9")) + + if _, still := f.excludedNodes[key]; still { + t.Fatal("exclusion set should be cleared on teardown") + } +} + +// schedulableNodes must drop control-plane (NoSchedule taint), NoExecute-tainted, +// and cordoned nodes, keeping only nodes a normal gang pod can actually land on. +// This keeps the Fluxion graph from offering nodes Kubernetes will reject in +// Filter (which, with whole-allocation PostFilter exclusion, strands the gang). +func TestSchedulableNodesDropsTaintedAndCordoned(t *testing.T) { + node := func(name string, unsched bool, effects ...corev1.TaintEffect) corev1.Node { + n := corev1.Node{} + n.Name = name + n.Spec.Unschedulable = unsched + for _, e := range effects { + n.Spec.Taints = append(n.Spec.Taints, corev1.Taint{Key: "k", Effect: e}) + } + return n + } + in := []corev1.Node{ + node("worker-1", false), + node("worker-2", false), + node("control-plane", false, corev1.TaintEffectNoSchedule), + node("draining", false, corev1.TaintEffectNoExecute), + node("cordoned", true), + node("prefer-only", false, corev1.TaintEffectPreferNoSchedule), // soft taint: keep + } + got := schedulableNodes(in) + gotNames := map[string]bool{} + for _, n := range got { + gotNames[n.Name] = true + } + want := []string{"worker-1", "worker-2", "prefer-only"} + if len(got) != len(want) { + t.Fatalf("expected %d schedulable nodes %v, got %d %v", len(want), want, len(got), gotNames) + } + for _, w := range want { + if !gotNames[w] { + t.Fatalf("expected %s kept, got set %v", w, gotNames) + } + } +} diff --git a/pkg/placement/placement.go b/pkg/placement/placement.go index 554f319..c7f76de 100644 --- a/pkg/placement/placement.go +++ b/pkg/placement/placement.go @@ -214,14 +214,36 @@ func 
withEntries(counts map[string]int) []jobspec.Resource { // allocation (duration 0 runs to graph end) plus an RFC 31 property constraint // selecting the eligible node set. properties is the AND-set of composed // key=value property strings a matched node must carry. -func systemAttributes(properties []string) map[string]interface{} { +func systemAttributes(properties []string, excludeNodes []string) map[string]interface{} { + // Base property constraint (the eligible-node property AND-set). + constraints := map[string]interface{}{ + "properties": properties, + } + // When a group has had a placement rejected by other scheduler plugins + // (taints, affinity, volume topology that Fluxion's graph does not model), + // PostFilter accumulates the rejected hostnames and we AND in an RFC 31 + // negated hostlist so the re-match is forced onto untried nodes. RFC 31 is + // JsonLogic-style ({operator:[values]}, one operator per object), so to AND + // two operators we nest them under an explicit `and`. We only do this when + // there is something to exclude, so the no-exclusion jobspec is byte-for-byte + // what it was before (and existing tests/behavior are unchanged). + if len(excludeNodes) > 0 { + constraints = map[string]interface{}{ + "and": []interface{}{ + map[string]interface{}{"properties": properties}, + map[string]interface{}{ + "not": []interface{}{ + map[string]interface{}{"hostlist": excludeNodes}, + }, + }, + }, + } + } return map[string]interface{}{ "system": map[string]interface{}{ // duration 0 => hold the allocation until we explicitly Cancel. - "duration": 0, - "constraints": map[string]interface{}{ - "properties": properties, - }, + "duration": 0, + "constraints": constraints, }, } } @@ -229,7 +251,7 @@ func systemAttributes(properties []string) map[string]interface{} { // computeJobspec builds the physical-compute jobspec for a group: one slot per // pod holding the compute resources, constrained to virtual=false nodes. 
This is // the only jobspec for a group that requests no virtual devices. -func computeJobspec(groupName string, slots int, compute map[string]int) *jobspec.Jobspec { +func computeJobspec(groupName string, slots int, compute map[string]int, excludeNodes []string) *jobspec.Jobspec { return &jobspec.Jobspec{ Version: 9999, Resources: []jobspec.Resource{{ @@ -238,7 +260,7 @@ func computeJobspec(groupName string, slots int, compute map[string]int) *jobspe Label: "default", With: withEntries(compute), }}, - Attributes: systemAttributes([]string{VirtualPropertyFalse}), + Attributes: systemAttributes([]string{VirtualPropertyFalse}, excludeNodes), Tasks: []jobspec.Task{{ Command: []string{groupName}, Slot: "default", @@ -272,7 +294,7 @@ func deviceJobspec(groupName, deviceType string, count int, extraProps []string) Label: "device", With: []jobspec.Resource{{Type: "node", Count: count}}, }}, - Attributes: systemAttributes(props), + Attributes: systemAttributes(props, nil), Tasks: []jobspec.Task{{ Command: []string{groupName}, Slot: "device", @@ -299,6 +321,7 @@ func JobspecsForGroup( groupName string, pods []corev1.Pod, knownDevices map[string]bool, + excludeNodes []string, ) ([]*jobspec.Jobspec, error) { if len(pods) == 0 { return nil, fmt.Errorf("pod group %q has no pods", groupName) @@ -321,7 +344,7 @@ func JobspecsForGroup( } } - specs := []*jobspec.Jobspec{computeJobspec(groupName, len(pods), compute)} + specs := []*jobspec.Jobspec{computeJobspec(groupName, len(pods), compute, excludeNodes)} // Deterministic device order for stable output. 
deviceTypes := make([]string, 0, len(devices)) diff --git a/pkg/placement/placement_test.go b/pkg/placement/placement_test.go index 33786c8..fe68917 100644 --- a/pkg/placement/placement_test.go +++ b/pkg/placement/placement_test.go @@ -64,7 +64,7 @@ func TestClassicalSingleMatch(t *testing.T) { podWith("p0", corev1.ResourceList{corev1.ResourceCPU: qty(4), "nvidia.com/gpu": qty(1)}), podWith("p1", corev1.ResourceList{corev1.ResourceCPU: qty(4), "nvidia.com/gpu": qty(1)}), } - specs, err := JobspecsForGroup("grp", pods, nil) + specs, err := JobspecsForGroup("grp", pods, nil, nil) if err != nil { t.Fatal(err) } @@ -101,7 +101,7 @@ func TestGroupDeviceMatchWhenLeaderNotFirst(t *testing.T) { }) // Leader deliberately placed last. pods := []corev1.Pod{worker, worker, leader} - specs, err := JobspecsForGroup("qgrp", pods, map[string]bool{"qpu": true}) + specs, err := JobspecsForGroup("qgrp", pods, map[string]bool{"qpu": true}, nil) if err != nil { t.Fatal(err) } @@ -132,7 +132,7 @@ func qpuPodWithRequires(name string, requires map[string]string) corev1.Pod { // constraints, nothing extra (over-constraining would break unconstrained runs). func TestNoRequireAnnotationsAddsNoConstraints(t *testing.T) { p := qpuPodWithRequires("q", nil) - specs, err := JobspecsForGroup("g", []corev1.Pod{p}, map[string]bool{"qpu": true}) + specs, err := JobspecsForGroup("g", []corev1.Pod{p}, map[string]bool{"qpu": true}, nil) if err != nil { t.Fatal(err) } @@ -145,7 +145,7 @@ func TestNoRequireAnnotationsAddsNoConstraints(t *testing.T) { // Exactly one require- constraint. 
func TestSingleRequireConstraint(t *testing.T) { p := qpuPodWithRequires("q", map[string]string{"qrmi_type": "braket-gate"}) - specs, err := JobspecsForGroup("g", []corev1.Pod{p}, map[string]bool{"qpu": true}) + specs, err := JobspecsForGroup("g", []corev1.Pod{p}, map[string]bool{"qpu": true}, nil) if err != nil { t.Fatal(err) } @@ -169,7 +169,7 @@ func TestMultipleRequireConstraintsAreDeduped(t *testing.T) { // a worker that happens to repeat one of the same require- annotations worker := qpuPodWithRequires("w0", map[string]string{"vendor": "amazon"}) specs, err := JobspecsForGroup("g", []corev1.Pod{leader, worker}, - map[string]bool{"qpu": true}) + map[string]bool{"qpu": true}, nil) if err != nil { t.Fatal(err) } @@ -211,7 +211,7 @@ func TestRequireAnnotationConstrainsDevice(t *testing.T) { leader.Annotations[RequireAnnotationPrefix+"vendor"] = "amazon" specs, err := JobspecsForGroup("qgrp", []corev1.Pod{leader}, - map[string]bool{"qpu": true}) + map[string]bool{"qpu": true}, nil) if err != nil { t.Fatal(err) } @@ -232,7 +232,7 @@ func TestDeviceProducesSecondMatch(t *testing.T) { FluxionResourcePrefix + "qpu": qty(1), }) known := map[string]bool{"qpu": true} - specs, err := JobspecsForGroup("qgrp", []corev1.Pod{p}, known) + specs, err := JobspecsForGroup("qgrp", []corev1.Pod{p}, known, nil) if err != nil { t.Fatal(err) } @@ -274,7 +274,7 @@ func TestDeviceProducesSecondMatch(t *testing.T) { // node), so there are two matches: compute (core=1, virtual=false) and device. func TestDeviceOnlyStillForcesCompute(t *testing.T) { p := podWith("q", corev1.ResourceList{FluxionResourcePrefix + "qpu": qty(1)}) - specs, err := JobspecsForGroup("qonly", []corev1.Pod{p}, map[string]bool{"qpu": true}) + specs, err := JobspecsForGroup("qonly", []corev1.Pod{p}, map[string]bool{"qpu": true}, nil) if err != nil { t.Fatal(err) } @@ -289,7 +289,7 @@ func TestDeviceOnlyStillForcesCompute(t *testing.T) { // Requesting a device type the graph does not model is a hard error. 
func TestUnknownDeviceErrors(t *testing.T) { p := podWith("q", corev1.ResourceList{FluxionResourcePrefix + "fpga": qty(1)}) - _, err := JobspecsForGroup("grp", []corev1.Pod{p}, map[string]bool{"qpu": true}) + _, err := JobspecsForGroup("grp", []corev1.Pod{p}, map[string]bool{"qpu": true}, nil) if err == nil { t.Fatal("expected an error for an unmodeled device type") } @@ -301,7 +301,7 @@ func TestHoldDurationZero(t *testing.T) { corev1.ResourceCPU: qty(1), FluxionResourcePrefix + "qpu": qty(1), }) - specs, err := JobspecsForGroup("g", []corev1.Pod{p}, map[string]bool{"qpu": true}) + specs, err := JobspecsForGroup("g", []corev1.Pod{p}, map[string]bool{"qpu": true}, nil) if err != nil { t.Fatal(err) } @@ -366,3 +366,76 @@ func TestPlacementUnmarkedNodeIsCompute(t *testing.T) { t.Fatalf("unmarked node should not be a backend, got %q", p.Backend) } } + +// When excludeNodes is non-empty, the compute jobspec's constraint must AND the +// base properties with an RFC 31 negated hostlist, so a re-match avoids the +// rejected nodes. When empty, the constraint must be the plain properties form +// (byte-for-byte the pre-exclusion behavior). 
+func TestExcludeNodesAddsNegatedHostlist(t *testing.T) { + p := podWith("p", corev1.ResourceList{corev1.ResourceCPU: qty(1)}) + + // no exclusion -> plain properties, no `and`/`not` + specs, err := JobspecsForGroup("g", []corev1.Pod{p}, nil, nil) + if err != nil { + t.Fatal(err) + } + cons := computeConstraints(t, specs[0]) + if _, hasAnd := cons["and"]; hasAnd { + t.Fatalf("no-exclusion constraint must not use `and`: %#v", cons) + } + if _, hasProps := cons["properties"]; !hasProps { + t.Fatalf("no-exclusion constraint must have plain properties: %#v", cons) + } + + // with exclusion -> and[ properties, not[ hostlist ] ] + specs, err = JobspecsForGroup("g", []corev1.Pod{p}, nil, []string{"node-b", "node-c"}) + if err != nil { + t.Fatal(err) + } + cons = computeConstraints(t, specs[0]) + andTerms, ok := cons["and"].([]interface{}) + if !ok || len(andTerms) != 2 { + t.Fatalf("exclusion constraint must be `and` of 2 terms: %#v", cons) + } + // find the not/hostlist term + foundHostlist := false + for _, term := range andTerms { + tm, _ := term.(map[string]interface{}) + notTerm, ok := tm["not"].([]interface{}) + if !ok || len(notTerm) == 0 { + continue + } + inner, _ := notTerm[0].(map[string]interface{}) + hl, ok := inner["hostlist"].([]string) + if !ok { + // json round-trip may make it []interface{}; accept both + if hlAny, ok2 := inner["hostlist"].([]interface{}); ok2 { + if len(hlAny) == 2 { + foundHostlist = true + } + } + continue + } + if len(hl) == 2 { + foundHostlist = true + } + } + if !foundHostlist { + t.Fatalf("exclusion constraint must contain not[hostlist[2 nodes]]: %#v", cons) + } +} + +// computeConstraints digs out attributes.system.constraints from the compute +// jobspec (the first spec; device specs do not carry node exclusions). 
+func computeConstraints(t *testing.T, spec *jobspec.Jobspec) map[string]interface{} { + t.Helper() + sys, ok := spec.Attributes["system"].(map[string]interface{}) + if !ok { + t.Fatalf("no system attributes: %#v", spec.Attributes) + } + cons, ok := sys["constraints"].(map[string]interface{}) + if !ok { + t.Fatalf("no constraints: %#v", sys) + } + return cons +} diff --git a/pkg/webhook/handler.go b/pkg/webhook/handler.go index 82a1227..61b97b1 100644 --- a/pkg/webhook/handler.go +++ b/pkg/webhook/handler.go @@ -25,34 +25,32 @@ type MutatorAPI interface { // InjectedEnv is the FLUXION_* env contract the scheduler/webhook supplies. InjectedEnv() []corev1.EnvVar - // PodGroup operations (gang scheduling). Group identity is the value of the - // group label, which the core treats as an opaque string. - PodGroupLeader(ctx context.Context, namespace, group string) string - EnsurePodGroup(ctx context.Context, namespace, group, leaderPod string) - RecordLeader(ctx context.Context, namespace, group, leaderPod string) - - // EnsureSidecarRBAC provisions the per-namespace ServiceAccount/Role/Binding - // the sidecar needs. - EnsureSidecarRBAC(ctx context.Context, namespace string) - - // InterceptorOps stages the fluence package into the quantum container via an - // init container + shared volume on PYTHONPATH (Model C). SidecarContainerOps - // adds the sidecar container (observe=true => observe-only telemetry mode). - InterceptorOps(pod *corev1.Pod) []spec.Op - SidecarContainerOps(pod *corev1.Pod, observe bool) []spec.Op + // EnsurePodGroup creates the group's PodGroup with the given gang minCount if + // it does not already exist (idempotent). Group identity is the opaque value + // of the group label. creatorPod is recorded only as the PodGroup's creator + // reference; the core ascribes no role semantics to it. 
+ EnsurePodGroup(ctx context.Context, namespace, group, creatorPod string, minCount int32) } // Handler inspects a pod and, when it applies, contributes JSON patch ops. A pod // flows through every registered handler whose Applies returns true; their ops // are concatenated. Applies is fully general — it receives the pod and the -// MutatorAPI, so a handler may consult cluster state (e.g. resolve a group's -// leader) in deciding whether it applies. +// MutatorAPI, so a handler may consult cluster state in deciding whether it +// applies. type Handler interface { Name() string Applies(ctx context.Context, m MutatorAPI, pod *corev1.Pod) bool Mutate(ctx context.Context, m MutatorAPI, pod *corev1.Pod) []spec.Op } +// DefaultHandlerOrder is the active set AND the dispatch order when the operator +// passes no --handlers flag. Order matters: specific handlers run before the +// generic gang fallback, so "gang" is LAST — it applies default gang sizing +// (group-size annotation or owner-derived N) only if no earlier handler already +// shaped the gang. To change the order or disable a handler, pass a different +// list (e.g. --handlers=fluxion,gang drops quantum). +var DefaultHandlerOrder = []string{"fluxion", "quantum", "gang"} + // ── registration ──────────────────────────────────────────────────────────────── // // Handlers self-register via Register() from their package's init(). The core @@ -60,15 +58,57 @@ type Handler interface { // webhook server wiring) is what populates the registry. This keeps the core // domain-agnostic: adding or removing a handler does not touch core code. -var registry []Handler +// available maps a handler's Name() to the handler. Populated by Register() from +// each handler package's init(). This is the set of handlers that EXIST; which +// ones actually run, and in what order, is decided by activeOrder. +var available = map[string]Handler{} + +// activeOrder is the ordered list of handler names to dispatch. 
It is BOTH the +// selection (names not present are disabled) and the order (dispatch follows the +// slice). Defaults to DefaultHandlerOrder; overridden by SetActiveHandlers. +var activeOrder = append([]string(nil), DefaultHandlerOrder...) -// Register adds a handler to the global set. Called from handler packages' -// init(). Order of registration is the order handlers run. +// Register adds a handler to the available set under its Name(). Called from +// handler packages' init(). func Register(h Handler) { - registry = append(registry, h) + available[h.Name()] = h +} + +// SetActiveHandlers sets the active, ordered handler list (the --handlers value). +// Empty/nil restores DefaultHandlerOrder. Names with no registered handler are +// dropped and returned as `unknown` so the caller can warn. Order is preserved +// exactly as given — the list is the dispatch order. +func SetActiveHandlers(names []string) (active, unknown []string) { + if len(names) == 0 { + activeOrder = append([]string(nil), DefaultHandlerOrder...) + return activeOrder, nil + } + var ordered []string + for _, n := range names { + if _, ok := available[n]; ok { + ordered = append(ordered, n) + } else { + unknown = append(unknown, n) + } + } + activeOrder = ordered + return activeOrder, unknown +} + +// ActiveHandlerNames returns the active dispatch order (for logging at startup). +func ActiveHandlerNames() []string { + return append([]string(nil), activeOrder...) } -// registered returns the registered handlers (the live registry). +// registered returns the active handlers, resolved from activeOrder, in order. +// Names in the order with no registered handler are skipped (already warned at +// SetActiveHandlers time). 
func registered() []Handler { - return registry + out := make([]Handler, 0, len(activeOrder)) + for _, n := range activeOrder { + if h, ok := available[n]; ok { + out = append(out, h) + } + } + return out } diff --git a/pkg/webhook/handlers/dependency.go b/pkg/webhook/handlers/dependency.go new file mode 100644 index 0000000..d25d598 --- /dev/null +++ b/pkg/webhook/handlers/dependency.go @@ -0,0 +1,131 @@ +package handlers + +import ( + "github.com/converged-computing/fluence/pkg/webhook/spec" + + corev1 "k8s.io/api/core/v1" +) + +// Dependency is Fluence's GENERAL "this set of pods must wait for a producer to +// be ready" primitive. It is deliberately NOT quantum-specific: quantum is the +// first resource type to use it (a gang waits for a quantum submission to reach +// the device queue), but the same primitive applies to any resource type whose +// readiness is produced out-of-band — a license server, a data stage-in job, a +// warmed cache, another gang, etc. +// +// A Dependency has three parts, each carried as a pod annotation so the +// relationship lives at the GROUP level (not duplicated as bespoke per-resource +// fields) and is readable by both the webhook (at admission) and the scheduler +// (in its reconcile loop): +// +// - Kind: what KIND of readiness this is (the resource type's name). The +// producer side knows how to satisfy this kind; the consumer side +// only knows it must wait. Quantum's kind is "quantum-submit". +// - Producer: the identity of the thing that will signal ready. For quantum it +// is the submitter's (base) group; generally it is whatever the +// kind's handler records as the satisfier. +// - Gate: the scheduling gate held on the dependent (consumer) pods until +// the producer signals ready. Removing the gate is the "ungate" +// and is performed by whatever observes the producer's readiness +// (the quantum sidecar for kind=quantum-submit; the scheduler's +// reconcile loop for kinds whose readiness is in-cluster, e.g. 
+// "another gang is Running"). +// +// The webhook PRODUCES a Dependency (gates the consumers, stamps the +// annotations); REMOVING the gate is owned by the observer best placed to see +// the producer's readiness. That split — declare here, observe elsewhere — is +// what keeps the primitive general: a new resource type adds a Kind and an +// observer and reuses the gating/annotation machinery unchanged. +type Dependency struct { + Kind string // resource-type readiness kind, e.g. "quantum-submit" + Producer string // identity of the readiness producer (e.g. the base group) + Gate string // scheduling gate held on dependents until ready +} + +// Dependency annotation keys (stamped on the dependent pods). Generic — no +// quantum in the names, so any resource type reuses them. +const ( + // DependsOnKindAnnotation names the readiness kind the dependent waits for. + DependsOnKindAnnotation = "fluence.flux-framework.org/depends-on-kind" + // DependsOnProducerAnnotation names the producer expected to signal ready. + DependsOnProducerAnnotation = "fluence.flux-framework.org/depends-on-producer" + // DependsOnGateAnnotation records which scheduling gate encodes the wait, so + // an observer knows exactly which gate to remove when the producer is ready. + DependsOnGateAnnotation = "fluence.flux-framework.org/depends-on-gate" +) + +// applyOps gates the dependent pod and stamps the dependency annotations so the +// relationship is self-describing on the pod. It reuses the gate machinery +// (gateWithName) verbatim — the gate is the universal "held until ready" +// mechanism regardless of resource type — so a new Kind costs only its readiness +// observer, not new gating code. +func (d Dependency) applyOps(pod *corev1.Pod) []spec.Op { + ops := gateWithName(pod, d.Gate) + ops = append(ops, annotateOp(pod, DependsOnKindAnnotation, d.Kind)...) + ops = append(ops, annotateOp(pod, DependsOnProducerAnnotation, d.Producer)...) 
+ ops = append(ops, annotateOp(pod, DependsOnGateAnnotation, d.Gate)...) + return ops +} + +// DependencyOf reads a dependent pod's declared Dependency, or ok=false if it +// carries none. The scheduler's reconcile loop and the sidecar use this to learn +// what a gated pod is waiting for without hardcoding a kind. +func DependencyOf(pod *corev1.Pod) (Dependency, bool) { + kind := spec.Annotation(pod, DependsOnKindAnnotation) + if kind == "" { + return Dependency{}, false + } + return Dependency{ + Kind: kind, + Producer: spec.Annotation(pod, DependsOnProducerAnnotation), + Gate: spec.Annotation(pod, DependsOnGateAnnotation), + }, true +} + +// annotateOp adds a single metadata annotation (creating the annotations map if +// the pod has none). The key is JSON-Pointer-escaped so slashes are handled. +func annotateOp(pod *corev1.Pod, key, value string) []spec.Op { + if value == "" { + return nil + } + if pod.Annotations == nil { + return []spec.Op{{ + Op: "add", + Path: "/metadata/annotations", + Value: map[string]string{key: value}, + }} + } + return []spec.Op{{ + Op: "add", + Path: "/metadata/annotations/" + escapeJSONPointer(key), + Value: value, + }} +} + +// gateWithName adds a named scheduling gate (idempotent) and raises priority for +// the held pod, generalizing the quantum gating to ANY gate name so the +// dependency primitive is not tied to the quantum gate. 
+func gateWithName(pod *corev1.Pod, gateName string) []spec.Op { + for _, g := range pod.Spec.SchedulingGates { + if g.Name == gateName { + return nil + } + } + var ops []spec.Op + gate := corev1.PodSchedulingGate{Name: gateName} + if len(pod.Spec.SchedulingGates) == 0 { + ops = append(ops, spec.Op{Op: "add", Path: "/spec/schedulingGates", Value: []corev1.PodSchedulingGate{gate}}) + } else { + ops = append(ops, spec.Op{Op: "add", Path: "/spec/schedulingGates/-", Value: gate}) + } + // Gated dependents schedule reliably once ungated only if they outrank other + // pending work; priorityClassName is immutable post-creation so it must be + // set now. Don't override a user's explicit class. spec.priority is cleared + // to null so the priority admission controller recomputes it from the class + // (add-null is valid whether the field is absent, 0, or set). + if pod.Spec.PriorityClassName == "" { + ops = append(ops, spec.Op{Op: "add", Path: "/spec/priorityClassName", Value: QuantumClassicalPriorityClass}) + ops = append(ops, spec.Op{Op: "add", Path: "/spec/priority", EmitNull: true}) + } + return ops +} diff --git a/pkg/webhook/handlers/gang.go b/pkg/webhook/handlers/gang.go index a6c6126..0469c11 100644 --- a/pkg/webhook/handlers/gang.go +++ b/pkg/webhook/handlers/gang.go @@ -2,11 +2,14 @@ package handlers import ( "context" + "log" + "strconv" "github.com/converged-computing/fluence/pkg/webhook" "github.com/converged-computing/fluence/pkg/webhook/spec" corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" ) func init() { @@ -14,7 +17,7 @@ func init() { } // gangHandler gang-schedules pods that carry the group label: it creates a -// Fluence-owned PodGroup (first pod admitted becomes the recorded leader) and +// Fluence-owned PodGroup and // links every pod to it via spec.schedulingGroup.podGroupName, which is the // field the scheduler gangs by. It knows nothing about quantum — a purely // classical gang is fully handled here, with no sidecar. 
@@ -28,15 +31,76 @@ func (h *gangHandler) Applies(ctx context.Context, m webhook.MutatorAPI, pod *co func (h *gangHandler) Mutate(ctx context.Context, m webhook.MutatorAPI, pod *corev1.Pod) []spec.Op { g := webhook.GroupName(pod) - // First pod admitted in the group creates the PodGroup and is recorded as - // the admission-order leader. All pods are linked to the group. - if m.PodGroupLeader(ctx, pod.Namespace, g) == "" { - m.EnsurePodGroup(ctx, pod.Namespace, g, pod.Name) - m.RecordLeader(ctx, pod.Namespace, g, pod.Name) - } + // Ensure the group's PodGroup exists with the resolved gang size, and link + // this pod to it. EnsurePodGroup is idempotent (no-ops if the PodGroup + // already exists — e.g. created by an earlier, more specific handler), so we + // call it unconditionally. The gang handler knows nothing about quantum or + // submitters; that is the quantum handler's concern. + // minCount = full gang size N (group-size annotation, else owner-derived); + // see resolveMinCount. + m.EnsurePodGroup(ctx, pod.Namespace, g, pod.Name, resolveMinCount(ctx, m, pod)) return schedulingGroupOps(pod, g) } +// resolveMinCount determines the gang's atomic-schedule size N: +// 1. explicit group-size annotation -> honor it verbatim. This is the override +// for when minCount must differ from the parent's replica count (e.g. the +// quantum leader/worker split, where the gang's N is expressed directly). +// 2. otherwise derive from the OWNING object: a Flux Operator MiniCluster pod +// is owned by an indexed Job whose parallelism == completions == size == N. +// (The operator sets Parallelism = Completions = MiniCluster.Spec.Size.) +// 3. otherwise default to 1, logged — never silently size a multi-pod gang to 1. +// +// The leader/worker (quantum) split is orthogonal and unchanged: it is driven by +// QuantumResource in the quantum handler. minCount is always the +// FULL gang N regardless of which pods get gated. 
+func resolveMinCount(ctx context.Context, m webhook.MutatorAPI, pod *corev1.Pod) int32 { + // 1. explicit override + if pod.Annotations != nil { + if n := pod.Annotations[webhook.GroupSizeAnnotation]; n != "" { + if v, err := strconv.Atoi(n); err == nil && v > 0 { + return int32(v) + } + } + } + // 2. derive from the owning Job's parallelism + if n := ownerJobN(ctx, m, pod); n > 0 { + return n + } + // 3. no signal: a single-pod gang. Log so a missing size on a multi-pod + // workload is visible rather than a silent gang-of-1. + log.Printf("[fluence-webhook] group %s: no group-size annotation and no owning Job parallelism; defaulting minCount=1", webhook.GroupName(pod)) + return 1 +} + +// ownerJobN returns the parallelism (== size N) of the indexed Job that owns the +// pod, or 0 if there is no such owner. The Flux Operator sets a MiniCluster's +// Job Parallelism == Completions == size, so this is the full gang size N. +// Shared by the gang handler (classical: minCount = N) and the quantum handler +// (split: leader group = 1, worker group = N-1). +func ownerJobN(ctx context.Context, m webhook.MutatorAPI, pod *corev1.Pod) int32 { + c := m.Client() + if c == nil { + return 0 + } + for _, ref := range pod.OwnerReferences { + if ref.Kind != "Job" { + continue + } + job, err := c.BatchV1().Jobs(pod.Namespace).Get(ctx, ref.Name, metav1.GetOptions{}) + if err != nil { + return 0 + } + if job.Spec.Parallelism != nil && *job.Spec.Parallelism > 0 { + return *job.Spec.Parallelism + } + if job.Spec.Completions != nil && *job.Spec.Completions > 0 { + return *job.Spec.Completions + } + } + return 0 +} + // schedulingGroupOps links a pod to its PodGroup via the native 1.36 field // spec.schedulingGroup.podGroupName. Idempotent if already linked. 
func schedulingGroupOps(pod *corev1.Pod, group string) []spec.Op { diff --git a/pkg/webhook/handlers/gang_test.go b/pkg/webhook/handlers/gang_test.go new file mode 100644 index 0000000..ac027f8 --- /dev/null +++ b/pkg/webhook/handlers/gang_test.go @@ -0,0 +1,153 @@ +/* +Copyright 2024 Lawrence Livermore National Security, LLC + (c.f. AUTHORS, NOTICE.LLNS, COPYING) +SPDX-License-Identifier: Apache-2.0 +*/ + +// Tests for gang PodGroup minCount: the whole gang (full N) must schedule +// atomically. Regression guard for the bug where every PodGroup was created +// with minCount=1, so a multi-pod gang was "satisfied" by a single pod and the +// rest were stranded (partial placement). +package handlers + +import ( + "context" + "testing" + + "strconv" + + "github.com/converged-computing/fluence/pkg/webhook" + + corev1 "k8s.io/api/core/v1" + + batchv1 "k8s.io/api/batch/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/client-go/kubernetes/fake" +) + +// minCountOf runs the gang handler for the leader pod of a group and returns the +// minCount of the PodGroup the webhook created. +func minCountOf(t *testing.T, pod *corev1.Pod) int32 { + t.Helper() + m := &webhook.Mutator{Clientset: fake.NewSimpleClientset()} + m.Mutate(context.Background(), pod) + pg, err := m.Clientset.SchedulingV1alpha2(). + PodGroups(pod.Namespace).Get(context.Background(), webhook.GroupName(pod), metav1.GetOptions{}) + if err != nil { + t.Fatalf("PodGroup not created: %v", err) + } + if pg.Spec.SchedulingPolicy.Gang == nil { + t.Fatal("PodGroup has no gang scheduling policy") + } + return pg.Spec.SchedulingPolicy.Gang.MinCount +} + +// minCountWithClient runs the gang handler with a pre-seeded clientset (so the +// owning Job exists) and returns the created PodGroup's minCount. +func minCountWithClient(t *testing.T, pod *corev1.Pod, objs ...interface{}) int32 { + t.Helper() + cs := fake.NewSimpleClientset(toRuntime(objs)...) 
+ m := &webhook.Mutator{Clientset: cs} + m.Mutate(context.Background(), pod) + pg, err := cs.SchedulingV1alpha2().PodGroups(pod.Namespace). + Get(context.Background(), webhook.GroupName(pod), metav1.GetOptions{}) + if err != nil { + t.Fatalf("PodGroup not created: %v", err) + } + return pg.Spec.SchedulingPolicy.Gang.MinCount +} + +func jobWithParallelism(ns, name string, n int32) *batchv1.Job { + return &batchv1.Job{ + ObjectMeta: metav1.ObjectMeta{Name: name, Namespace: ns}, + Spec: batchv1.JobSpec{Parallelism: &n, Completions: &n}, + } +} + +func ownedBy(pod *corev1.Pod, kind, name string) { + pod.OwnerReferences = append(pod.OwnerReferences, + metav1.OwnerReference{Kind: kind, Name: name}) +} + +// No annotation, but the pod is owned by an indexed Job with parallelism N +// (the Flux Operator MiniCluster case: Parallelism == Completions == size == N). +// minCount must come from the Job. +func TestGangMinCountDerivedFromOwningJob(t *testing.T) { + pod := cpuPod("fluence") + pod.Namespace = "default" + pod.Labels = map[string]string{webhook.GroupLabel: "mc-gang"} + ownedBy(pod, "Job", "mc-gang-job") + got := minCountWithClient(t, pod, jobWithParallelism("default", "mc-gang-job", 4)) + if got != 4 { + t.Errorf("owner-derived: minCount=%d, want 4 (from Job parallelism)", got) + } +} + +// The explicit annotation OVERRIDES the owning Job's parallelism (the override +// exists precisely because minCount may differ from the parent replica count). 
+func TestGangMinCountAnnotationOverridesOwner(t *testing.T) { + pod := cpuPod("fluence") + pod.Namespace = "default" + pod.Labels = map[string]string{webhook.GroupLabel: "ovr-gang"} + pod.Annotations = map[string]string{webhook.GroupSizeAnnotation: "2"} + ownedBy(pod, "Job", "ovr-gang-job") + got := minCountWithClient(t, pod, jobWithParallelism("default", "ovr-gang-job", 8)) + if got != 2 { + t.Errorf("annotation override: minCount=%d, want 2 (annotation wins over Job=8)", got) + } +} + +// A classical gang of size N must get minCount = N so the whole group schedules +// atomically (this is the core multi-gang fix). +func atoi32(s string) int32 { v, _ := strconv.Atoi(s); return int32(v) } + +func toRuntime(objs []interface{}) []runtime.Object { + out := make([]runtime.Object, 0, len(objs)) + for _, o := range objs { + if ro, ok := o.(runtime.Object); ok { + out = append(out, ro) + } + } + return out +} + +func TestGangMinCountEqualsGroupSize(t *testing.T) { + for _, n := range []string{"2", "4", "8"} { + pod := cpuPod("fluence") + pod.Namespace = "default" + pod.Labels = map[string]string{webhook.GroupLabel: "g-" + n} + pod.Annotations = map[string]string{webhook.GroupSizeAnnotation: n} + got := minCountOf(t, pod) + want := atoi32(n) + if got != want { + t.Errorf("group-size=%s: minCount=%d, want %d", n, got, want) + } + } +} + +// No group-size annotation -> minCount falls back to 1 (single-pod gang). +func TestGangMinCountDefaultsToOne(t *testing.T) { + pod := cpuPod("fluence") + pod.Namespace = "default" + pod.Labels = map[string]string{webhook.GroupLabel: "g-default"} + if got := minCountOf(t, pod); got != 1 { + t.Errorf("absent group-size: minCount=%d, want 1", got) + } +} + +// group-size is the authoritative gang minCount: a workload that sets it to N +// gets minCount=N (the whole gang schedules atomically), regardless of any owner +// replica count. In the gang+submitter model the full workload IS the gang — +// there is no N-1 worker split. 
+func TestGangMinCountHonorsGroupSize(t *testing.T) { + pod := cpuPod("fluence") + pod.Namespace = "default" + pod.Labels = map[string]string{webhook.GroupLabel: "q-gang"} + pod.Annotations = map[string]string{ + webhook.GroupSizeAnnotation: "4", // full gang size + } + if got := minCountOf(t, pod); got != 4 { + t.Errorf("group-size gang: minCount=%d, want 4 (full N)", got) + } +} diff --git a/pkg/webhook/handlers/handlers_test.go b/pkg/webhook/handlers/handlers_test.go index 04d0e02..4931a8a 100644 --- a/pkg/webhook/handlers/handlers_test.go +++ b/pkg/webhook/handlers/handlers_test.go @@ -9,10 +9,7 @@ import ( "github.com/converged-computing/fluence/pkg/webhook/spec" corev1 "k8s.io/api/core/v1" - schedulingv1alpha2 "k8s.io/api/scheduling/v1alpha2" "k8s.io/apimachinery/pkg/api/resource" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/client-go/kubernetes/fake" ) // ── fixtures ──────────────────────────────────────────────────────────────────── @@ -87,12 +84,12 @@ func hasSidecarOp(ops []spec.Op) bool { for _, op := range ops { switch v := op.Value.(type) { case corev1.Container: - if v.Name == "fluence-sidecar" { + if v.Name == SidecarContainerName { return true } case []corev1.Container: for _, c := range v { - if c.Name == "fluence-sidecar" { + if c.Name == SidecarContainerName { return true } } @@ -127,238 +124,6 @@ func TestMutateSkipsNonFluxion(t *testing.T) { } } -// ── quantum handler: submitter ────────────────────────────────────────────────── - -func TestSingleQuantumGetsInterceptorNoSidecar(t *testing.T) { - m := &webhook.Mutator{AttributeKeys: []string{"region"}} - ops := m.Mutate(context.Background(), qpuPod("fluence")) - names := opEnvNames(ops) - if !contains(names, "FLUXION_BACKEND") { - t.Errorf("want FLUXION_BACKEND, got %v", names) - } - if !contains(names, "PYTHONPATH") || !contains(names, "FLUENCE_POD_UID") { - t.Errorf("want interceptor env (PYTHONPATH, FLUENCE_POD_UID), got %v", names) - } - if hasSidecarOp(ops) { - 
t.Error("standalone quantum pod should not get a sidecar") - } - if hasGateOp(ops) { - t.Error("standalone quantum pod should not be gated") - } -} - -func TestObserveLabelInjectsSidecar(t *testing.T) { - m := &webhook.Mutator{} - pod := qpuPod("fluence") - pod.Labels = map[string]string{ObserveLabel: "true"} - ops := m.Mutate(context.Background(), pod) - if !hasSidecarOp(ops) { - t.Error("observe-labeled quantum pod should get the sidecar") - } - if hasGateOp(ops) { - t.Error("observe-only pod should not be gated") - } -} - -// ── quantum handler: worker gating ────────────────────────────────────────────── - -func quantumGroupFixture(ns, group, leaderName string) *fake.Clientset { - pg := &schedulingv1alpha2.PodGroup{ - ObjectMeta: metav1.ObjectMeta{ - Name: group, Namespace: ns, - Annotations: map[string]string{webhook.LeaderAnnotation: leaderName}, - }, - } - leaderPod := qpuPod("fluence") - leaderPod.Name = leaderName - leaderPod.Namespace = ns - leaderPod.Labels = map[string]string{webhook.GroupLabel: group} - return fake.NewSimpleClientset(pg, leaderPod) -} - -func TestClassicalWorkerInQuantumGroupIsGated(t *testing.T) { - ns, group, leader := "default", "qaoa", "qaoa-leader" - m := &webhook.Mutator{Clientset: quantumGroupFixture(ns, group, leader)} - - worker := cpuPod("fluence") - worker.Name = "qaoa-worker-0" - worker.Namespace = ns - worker.Labels = map[string]string{webhook.GroupLabel: group} - - ops := m.Mutate(context.Background(), worker) - if !hasGateOp(ops) { - t.Errorf("classical worker in a quantum group should be gated; ops=%v", ops) - } - if hasSidecarOp(ops) { - t.Error("worker should not get a sidecar") - } -} - -func TestClassicalGangWorkerNotGated(t *testing.T) { - ns, group, leader := "default", "classical", "classical-leader" - pg := &schedulingv1alpha2.PodGroup{ - ObjectMeta: metav1.ObjectMeta{Name: group, Namespace: ns, - Annotations: map[string]string{webhook.LeaderAnnotation: leader}}, - } - leaderPod := cpuPod("fluence") - 
leaderPod.Name = leader - leaderPod.Namespace = ns - leaderPod.Labels = map[string]string{webhook.GroupLabel: group} - m := &webhook.Mutator{Clientset: fake.NewSimpleClientset(pg, leaderPod)} - - worker := cpuPod("fluence") - worker.Name = "classical-worker-0" - worker.Namespace = ns - worker.Labels = map[string]string{webhook.GroupLabel: group} - - if hasGateOp(m.Mutate(context.Background(), worker)) { - t.Error("worker in a classical gang must NOT be gated (would deadlock)") - } -} - -// Pod-template gang: every pod requests QPU; only the recorded leader gets the -// sidecar, the rest are gated workers (role by admission order, not request). -func TestPodTemplateGangSecondPodIsWorker(t *testing.T) { - ns, group, leader := "default", "qaoa", "qaoa-abc123" - pg := &schedulingv1alpha2.PodGroup{ - ObjectMeta: metav1.ObjectMeta{Name: group, Namespace: ns, - Annotations: map[string]string{webhook.LeaderAnnotation: leader}}, - } - leaderPod := qpuPod("fluence") - leaderPod.Name = leader - leaderPod.Namespace = ns - leaderPod.Labels = map[string]string{webhook.GroupLabel: group} - m := &webhook.Mutator{Clientset: fake.NewSimpleClientset(pg, leaderPod)} - - second := qpuPod("fluence") // identical spec, requests QPU - second.Name = "qaoa-def456" - second.Namespace = ns - second.Labels = map[string]string{webhook.GroupLabel: group} - - ops := m.Mutate(context.Background(), second) - if !hasGateOp(ops) { - t.Error("second pod in a pod-template gang must be gated as a worker") - } - if hasSidecarOp(ops) { - t.Error("second pod must NOT get a sidecar (it is a worker)") - } -} - -// ── quantum handler: explicit role annotation ────────────────────────────────── -// -// These cover the fluence.flux-framework.org/role annotation, which makes the -// leader/worker split EXPLICIT rather than inferred by admission order. When the -// annotation is present it is authoritative; the same value is echoed to the -// container as FLUENCE_ROLE so the app reads the role Fluence used. 
- -// roledQPUPod is a QPU-requesting pod in a group with an explicit role. -func roledQPUPod(ns, group, name, role string) *corev1.Pod { - p := qpuPod("fluence") - p.Name = name - p.Namespace = ns - p.Labels = map[string]string{webhook.GroupLabel: group} - p.Annotations = map[string]string{webhook.RoleAnnotation: role} - return p -} - -// An explicitly-declared leader gets the sidecar and is NOT gated — even though -// no leader is recorded on the PodGroup (admission order never consulted). -func TestExplicitLeaderGetsSidecarNotGated(t *testing.T) { - ns, group := "default", "qaoa" - // fixture with NO LeaderAnnotation recorded — proves we don't rely on it. - pg := &schedulingv1alpha2.PodGroup{ - ObjectMeta: metav1.ObjectMeta{Name: group, Namespace: ns}, - } - m := &webhook.Mutator{Clientset: fake.NewSimpleClientset(pg)} - - leader := roledQPUPod(ns, group, "qaoa-leader", RoleLeader) - ops := m.Mutate(context.Background(), leader) - if hasGateOp(ops) { - t.Error("explicit leader must NOT be gated") - } - if !hasSidecarOp(ops) { - t.Error("explicit leader must get the sidecar") - } - if !contains(opEnvNames(ops), "FLUENCE_ROLE") { - t.Error("leader must get FLUENCE_ROLE injected for the app to read") - } -} - -// An explicitly-declared worker is gated and gets no sidecar — even if it -// requests the QPU resource itself and even if it (wrongly) appears as the -// recorded leader. The annotation overrides both. -func TestExplicitWorkerIsGatedRegardlessOfAdmission(t *testing.T) { - ns, group := "default", "qaoa" - // Adversarial fixture: record THIS worker's own name as the admission-order - // leader. The explicit role:worker must still win and gate it. 
- worker := roledQPUPod(ns, group, "qaoa-worker-0", RoleWorker) - pg := &schedulingv1alpha2.PodGroup{ - ObjectMeta: metav1.ObjectMeta{Name: group, Namespace: ns, - Annotations: map[string]string{webhook.LeaderAnnotation: worker.Name}}, - } - m := &webhook.Mutator{Clientset: fake.NewSimpleClientset(pg)} - - ops := m.Mutate(context.Background(), worker) - if !hasGateOp(ops) { - t.Error("explicit worker must be gated even if mis-recorded as the admission-order leader") - } - if hasSidecarOp(ops) { - t.Error("explicit worker must NOT get a sidecar") - } - if !contains(opEnvNames(ops), "FLUENCE_ROLE") { - t.Error("worker must get FLUENCE_ROLE injected so the app knows it is a worker") - } -} - -// A heterogeneous gang declared with explicit roles resolves to exactly one -// leader (sidecar, ungated) and the rest workers (gated) — independent of the -// order in which the webhook admits the pods. This is the property a -// leader/worker quantum gang needs and that admission order cannot guarantee. 
-func TestExplicitRolesResolveRegardlessOfOrder(t *testing.T) { - ns, group := "default", "qaoa" - pg := &schedulingv1alpha2.PodGroup{ - ObjectMeta: metav1.ObjectMeta{Name: group, Namespace: ns}, // no recorded leader - } - m := &webhook.Mutator{Clientset: fake.NewSimpleClientset(pg)} - - pods := []*corev1.Pod{ - roledQPUPod(ns, group, "w0", RoleWorker), - roledQPUPod(ns, group, "leader", RoleLeader), - roledQPUPod(ns, group, "w1", RoleWorker), - } - var leaders, workers int - for _, p := range pods { // any admission order - ops := m.Mutate(context.Background(), p) - switch { - case hasSidecarOp(ops) && !hasGateOp(ops): - leaders++ - case hasGateOp(ops) && !hasSidecarOp(ops): - workers++ - default: - t.Fatalf("pod %s resolved to neither a clean leader nor worker", p.Name) - } - } - if leaders != 1 || workers != 2 { - t.Fatalf("want 1 leader + 2 workers, got %d leaders / %d workers", leaders, workers) - } -} - -// Backwards compatibility: with NO role annotation, the leader is still chosen -// by admission order (the recorded PodGroup leader), exactly as before. 
-func TestNoRoleAnnotationFallsBackToAdmissionOrder(t *testing.T) { - ns, group, leader := "default", "qaoa", "qaoa-leader" - m := &webhook.Mutator{Clientset: quantumGroupFixture(ns, group, leader)} - - // a second pod with no role annotation, not the recorded leader -> worker - second := qpuPod("fluence") - second.Name = "qaoa-second" - second.Namespace = ns - second.Labels = map[string]string{webhook.GroupLabel: group} - if !hasGateOp(m.Mutate(context.Background(), second)) { - t.Error("without a role annotation, a non-leader group member must be gated by admission order") - } -} - // ── gang handler: scheduling group linkage ────────────────────────────────────── func TestGangStampsSchedulingGroup(t *testing.T) { diff --git a/pkg/webhook/handlers/quantum.go b/pkg/webhook/handlers/quantum.go index 97fbfa6..47e1714 100644 --- a/pkg/webhook/handlers/quantum.go +++ b/pkg/webhook/handlers/quantum.go @@ -4,11 +4,16 @@ import ( "context" "fmt" "log" + "os" + "strconv" + "strings" "github.com/converged-computing/fluence/pkg/webhook" "github.com/converged-computing/fluence/pkg/webhook/spec" corev1 "k8s.io/api/core/v1" + rbacv1 "k8s.io/api/rbac/v1" + "k8s.io/apimachinery/pkg/api/resource" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" ) @@ -16,188 +21,560 @@ func init() { webhook.Register(&quantumHandler{}) } -// Quantum-specific policy. The webhook core knows NONE of these — they live -// only here, in the quantum handler. +// Quantum-specific policy. The webhook core knows NONE of these — they live only +// here, in the quantum handler. +// +// Model (no leader/worker): a workload requesting the quantum resource (Job, +// Deployment, or loose pods — the trigger is the resource, not the kind) becomes +// a GANG of full size N: one PodGroup, every pod fully gated and raised to a +// preempting priority, each staged with the interceptor in FAUX mode (the submit +// is a no-op). 
Fluence ALSO creates a separate one-off SUBMITTER pod — a +// group-of-one running the SAME application container plus the real sidecar — +// which submits the quantum task for real, tags it, stamps the resulting job-id +// onto the gang, and ungates the gang. There is no leader among the user's pods; +// the submitter is the only submitting pod and Fluence owns it. const ( - // QuantumResource is the Fluxion resource a pod requests when it wants - // Fluence to schedule quantum work. Requesting it is the trigger for sidecar - // + interceptor injection. + // QuantumResource is the Fluxion resource a pod requests to ask Fluence to + // schedule quantum work. Requesting it is the sole trigger for this handler. QuantumResource = "fluxion.flux-framework.org/qpu" - // QuantumGate holds a classical worker until the leader's quantum task is - // ready (the sidecar removes it). + // QuantumGate holds a gang pod unscheduled until the submitter's task is + // ready (the submitter's sidecar removes it). QuantumGate = "quantum.braket/ready" - // ObserveLabel opts a standalone quantum pod into observe-only telemetry: - // the sidecar is injected and polls queue position but ungates nothing. + // ObserveLabel opts a STANDALONE quantum pod (a group of one) into + // observe-only telemetry: the sidecar is injected and polls queue position + // but ungates nothing. ObserveLabel = "fluence.flux-framework.org/observe" - // Role values for webhook.RoleAnnotation. - RoleLeader = "leader" - RoleWorker = "worker" + // DependencyKindQuantumSubmit is the readiness Kind for the quantum resource + // type: gang pods wait for a quantum submission to reach the device queue. + // First concrete instance of the general Dependency primitive (dependency.go). 
+ DependencyKindQuantumSubmit = "quantum-submit" + + // SubmitterAnnotation marks the Fluence-created submitter pod so its own + // admission is recognized (real sidecar, real submit, not gated) instead of + // being treated as another gang member. + SubmitterAnnotation = "fluence.flux-framework.org/submitter" + + // GangGroupAnnotation, set on the submitter at creation, names the gang group + // the submitter must ungate. Surfaced to its sidecar as FLUENCE_GANG_GROUP. + GangGroupAnnotation = "fluence.flux-framework.org/gang-group" + + // SubmitterGroupSuffix: the submitter is its own group-of-one named + // {group}-submitter (a distinct PodGroup, minCount 1, so it schedules alone + // and never deadlocks against the gated gang). + SubmitterGroupSuffix = "-submitter" + + // GangGroupEnv tells the submitter's sidecar which gang group label to list + // and ungate when the task is ready. + GangGroupEnv = "FLUENCE_GANG_GROUP" ) -// quantumHandler coordinates quantum-classical workflows. It applies to a pod -// in either role: -// - the quantum submitter (requests QuantumResource): inject the interceptor, -// plus the sidecar when there is coordination to do (group leader, or -// observe-only telemetry requested); -// - a classical worker (a non-leader member of a group whose leader is a -// quantum pod): gate it until the leader's task is ready. -// -// This is the only place in the webhook that knows about quantum resources, -// gates, or observe semantics. +// quantumHandler creates, for a quantum workload, a fully-gated faux-submitting +// gang plus a one-off real submitter (see the package-level model comment). It +// is the only place in the webhook that knows about quantum resources, gates, +// submitters, or observe semantics. type quantumHandler struct{} func (h *quantumHandler) Name() string { return "quantum" } +// Applies to any pod requesting the quantum resource.
Gang members run the same +// image as the submitter and request it; the submitter (a copy) requests it; a +// standalone quantum pod requests it. Nothing without the resource needs quantum +// handling, so this is the single, unambiguous trigger. func (h *quantumHandler) Applies(ctx context.Context, m webhook.MutatorAPI, pod *corev1.Pod) bool { - if spec.PodRequestsResource(pod, QuantumResource) { - return true + return spec.PodRequestsResource(pod, QuantumResource) +} + +func (h *quantumHandler) Mutate(ctx context.Context, m webhook.MutatorAPI, pod *corev1.Pod) []spec.Op { + // The Fluence-created submitter: real interceptor + real sidecar, its own + // group-of-one, NOT gated. Recognized by the marker set at creation. + if spec.Annotation(pod, SubmitterAnnotation) == "true" { + return h.mutateSubmitter(ctx, m, pod) + } + + g := resolveGroup(pod) + observe := spec.Label(pod, ObserveLabel) == "true" + n := resolveGangSize(ctx, m, pod, g) + + // Standalone quantum pod (a group of one): it performs its own real submit. + // No gang, no gating, no faux, no separate submitter. The sidecar is added + // only for observe-only telemetry. + if g == "" || n <= 1 { + ops := interceptorOps(pod) + if observe { + sc := sidecarFor(m) + sc.EnsureRBAC(ctx, pod.Namespace) + ops = append(ops, sc.ContainerOps(pod, true, nil)...) + } + log.Printf("[fluence-webhook] quantum standalone %s/%s (observe=%v)", pod.Namespace, pod.Name, observe) + return ops } - // An explicitly-declared worker applies (so it gets gated) even if it - // doesn't request the quantum resource and the leader isn't recorded yet — - // this removes the admission-order race for explicitly-roled gangs. - if webhook.Role(pod) == RoleWorker && webhook.GroupName(pod) != "" { - return true + + // Gang member: full gang of N in one PodGroup, fully gated + preempting + // priority + faux interceptor. Fluence also ensures the one-off submitter + // (idempotent) that does the real submit and ungates this gang. 
+ m.EnsurePodGroup(ctx, pod.Namespace, g, pod.Name, n) + ensureSubmitterPod(ctx, m, pod, g) + + ops := linkGroupOps(pod, g) + // Express the wait as the GENERAL dependency primitive: this gang pod depends + // on the quantum submission produced by {group}-submitter, held by the quantum + // gate. applyOps gates the pod, raises priority, and stamps depends-on-*. + dep := Dependency{Kind: DependencyKindQuantumSubmit, Producer: g + SubmitterGroupSuffix, Gate: QuantumGate} + ops = append(ops, dep.applyOps(pod)...) + // Same interceptor as the submitter, but FAUX mode so the gang pod never + // resubmits; it receives the real task id via FLUENCE_QUANTUM_JOB_ID. + ops = append(ops, interceptorOps(pod)...) + ops = append(ops, fauxSubmitEnvOps(pod)...) + log.Printf("[fluence-webhook] quantum gang member %s/%s — group %s minCount=%d, gated+faux", + pod.Namespace, pod.Name, g, n) + return ops +} + +// mutateSubmitter wires the Fluence-created submitter pod: its own PodGroup of +// one, the real interceptor (tag mode), RBAC, and the sidecar container told +// which gang group to ungate (FLUENCE_GANG_GROUP). The submitter is never gated. +func (h *quantumHandler) mutateSubmitter(ctx context.Context, m webhook.MutatorAPI, pod *corev1.Pod) []spec.Op { + sg := webhook.GroupName(pod) // the submitter's own group: {group}-submitter + gang := spec.Annotation(pod, GangGroupAnnotation) + if sg != "" { + m.EnsurePodGroup(ctx, pod.Namespace, sg, pod.Name, 1) } - return h.isWorkerOfQuantumGroup(ctx, m, pod) + sc := sidecarFor(m) + ops := sc.InterceptorOps(pod) + sc.EnsureRBAC(ctx, pod.Namespace) + extra := []corev1.EnvVar{{Name: GangGroupEnv, Value: gang}} + ops = append(ops, sc.ContainerOps(pod, false, extra)...) + log.Printf("[fluence-webhook] quantum submitter %s/%s — group %s (ungates gang %q)", + pod.Namespace, pod.Name, sg, gang) + return ops } -// isWorkerOfQuantumGroup reports whether pod is a non-leader member of a group -// whose recorded leader is a quantum (QuantumResource-requesting) pod.
Workers -// are classical and do not request the resource themselves, so their role is a -// property of group membership, resolved against cluster state. -func (h *quantumHandler) isWorkerOfQuantumGroup(ctx context.Context, m webhook.MutatorAPI, pod *corev1.Pod) bool { - g := webhook.GroupName(pod) - if g == "" || m.Client() == nil { - return false +// resolveGroup returns the gang group identity: the explicit group label, else +// the owning controller's name (Job/ReplicaSet/StatefulSet — a Deployment's pods +// are owned by a ReplicaSet), else "" (a loose quantum pod with no group, which +// is treated as a standalone group of one). +func resolveGroup(pod *corev1.Pod) string { + if g := webhook.GroupName(pod); g != "" { + return g + } + for _, ref := range pod.OwnerReferences { + switch ref.Kind { + case "Job", "ReplicaSet", "StatefulSet": + return ref.Name + } } - leader := m.PodGroupLeader(ctx, pod.Namespace, g) - if leader == "" || leader == pod.Name { - return false + return "" +} + +// resolveGangSize returns the full gang size N: the explicit group-size +// annotation (authoritative override), else the owner's replica count (Job +// parallelism/completions, ReplicaSet replicas), else a count of pods already +// carrying the group label (best-effort for loose grouped pods; admission-order +// dependent, which is why the annotation is preferred), else 1. 
+func resolveGangSize(ctx context.Context, m webhook.MutatorAPI, pod *corev1.Pod, group string) int32 { + if pod.Annotations != nil { + if v, err := strconv.Atoi(pod.Annotations[webhook.GroupSizeAnnotation]); err == nil && v > 0 { + return int32(v) + } + } + if n := ownerJobN(ctx, m, pod); n > 0 { + return n + } + if n := ownerReplicaSetN(ctx, m, pod); n > 0 { + return n + } + if group != "" { + if n := countGroupPods(ctx, m, pod.Namespace, group); n > 0 { + return n + } } - lp, err := m.Client().CoreV1().Pods(pod.Namespace).Get(ctx, leader, metav1.GetOptions{}) + return 1 +} + +// ownerReplicaSetN returns the replica count of the ReplicaSet that owns the pod +// (the Deployment case: Deployment -> ReplicaSet -> Pod), or 0 if none. +func ownerReplicaSetN(ctx context.Context, m webhook.MutatorAPI, pod *corev1.Pod) int32 { + c := m.Client() + if c == nil { + return 0 + } + for _, ref := range pod.OwnerReferences { + if ref.Kind != "ReplicaSet" { + continue + } + rs, err := c.AppsV1().ReplicaSets(pod.Namespace).Get(ctx, ref.Name, metav1.GetOptions{}) + if err != nil { + return 0 + } + if rs.Spec.Replicas != nil && *rs.Spec.Replicas > 0 { + return *rs.Spec.Replicas + } + } + return 0 +} + +// countGroupPods counts pods already carrying the group label (best-effort gang +// size for loose grouped pods that have neither a group-size annotation nor an +// owning controller). Admission-order dependent — prefer the group-size +// annotation when the exact size must be guaranteed. 
+func countGroupPods(ctx context.Context, m webhook.MutatorAPI, namespace, group string) int32 { + c := m.Client() + if c == nil { + return 0 + } + list, err := c.CoreV1().Pods(namespace).List(ctx, metav1.ListOptions{ + LabelSelector: webhook.GroupLabel + "=" + group, + }) if err != nil { - return false + return 0 } - return spec.PodRequestsResource(lp, QuantumResource) + return int32(len(list.Items)) } -func (h *quantumHandler) Mutate(ctx context.Context, m webhook.MutatorAPI, pod *corev1.Pod) []spec.Op { - g := webhook.GroupName(pod) - - // Determine role. An explicit role annotation is AUTHORITATIVE: the workload - // declares which pod leads and which wait, and Fluence honors it directly — - // no admission-order race, and the same value is echoed to the app as - // FLUENCE_ROLE so the webhook's notion of leader and the application's notion - // cannot disagree. When the annotation is absent, fall back to the legacy - // behavior: role is decided by admission order (the first pod admitted in the - // group, recorded on the PodGroup by the gang handler). The admission-order - // path suits a homogeneous pod-template gang where every pod is identical; - // the explicit annotation suits a heterogeneous leader/worker gang. - role := webhook.Role(pod) - var isWorker bool - switch role { - case RoleWorker: - isWorker = true - case RoleLeader: - isWorker = false - default: - if g != "" { - leader := m.PodGroupLeader(ctx, pod.Namespace, g) - isWorker = leader != "" && leader != pod.Name - } - } - - if g != "" && isWorker { - log.Printf("[fluence-webhook] quantum worker %s/%s (role=%q) — gating", - pod.Namespace, pod.Name, role) - ops := gateOps(pod) - ops = append(ops, roleEnvOps(pod, RoleWorker)...) - return ops +// SubmitterPodSuffix names the Fluence-created submitter for a group: +// {group}-submitter. It also serves as the submitter's own PodGroup name.
+const SubmitterPodSuffix = SubmitterGroupSuffix + +// ensureSubmitterPod creates the one-off quantum submitter pod for a group +// (idempotent create-if-absent — a client side-effect of admission, like +// EnsurePodGroup/EnsureSidecarRBAC; NOT a separate controller). It is built from +// the admitted gang pod so it runs the SAME application + credentials, is its own +// group-of-one ({group}-submitter), is marked the submitter (so its admission +// gets the real sidecar and is not gated), and records the gang group it must +// ungate. An ownerReference to the gang's PodGroup cascades GC: when the gang +// PodGroup is deleted (gang completed/deleted), the submitter is collected too. +func ensureSubmitterPod(ctx context.Context, m webhook.MutatorAPI, gangPod *corev1.Pod, group string) { + c := m.Client() + if c == nil { + return + } + name := group + SubmitterGroupSuffix + if _, err := c.CoreV1().Pods(gangPod.Namespace).Get(ctx, name, metav1.GetOptions{}); err == nil { + return // already created (idempotent) } + // Clean copy of the user's application: same containers (image, env, creds, + // the quantum resource request) and app volumes — none of the gang's gating + // or faux wiring. + src := gangPod.DeepCopy() + submitter := &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: name, + Namespace: gangPod.Namespace, + Labels: map[string]string{webhook.GroupLabel: name}, + Annotations: map[string]string{ + SubmitterAnnotation: "true", + GangGroupAnnotation: group, + }, + }, + Spec: corev1.PodSpec{ + SchedulerName: webhook.SchedulerName, + RestartPolicy: corev1.RestartPolicyNever, + Containers: src.Spec.Containers, + Volumes: src.Spec.Volumes, + }, + } + // Cascade GC: own the submitter by the gang's PodGroup (created moments ago by + // the caller). Best-effort — only set when the PodGroup UID is known (it is on + // a real cluster; the fake client in tests may leave it empty, in which case + // we skip the ref rather than emit an invalid one).
+ if pg, err := c.SchedulingV1alpha2().PodGroups(gangPod.Namespace).Get(ctx, group, metav1.GetOptions{}); err == nil && pg.UID != "" { + submitter.OwnerReferences = []metav1.OwnerReference{{ + APIVersion: "scheduling.k8s.io/v1alpha2", + Kind: "PodGroup", + Name: group, + UID: pg.UID, + }} + } + if _, err := c.CoreV1().Pods(gangPod.Namespace).Create(ctx, submitter, metav1.CreateOptions{}); err != nil { + log.Printf("[fluence-webhook] submitter pod %s/%s: %v", gangPod.Namespace, name, err) + } else { + log.Printf("[fluence-webhook] created submitter pod %s/%s for gang %s", gangPod.Namespace, name, group) + } +} - // Submitter/leader role: recorded or declared group leader, or a standalone - // quantum pod. Always gets the interceptor (so its task is tagged). It gets - // the SIDECAR only when there is coordination to do: it is a group leader - // (workers to ungate), or observe-only telemetry is requested. - isLeader := g != "" - observe := spec.Label(pod, ObserveLabel) == "true" +// linkGroupOps ensures the gang pod carries the group label (so the submitter's +// sidecar can list it) and is linked to the gang PodGroup via +// spec.schedulingGroup.podGroupName. Idempotent. 
+func linkGroupOps(pod *corev1.Pod, group string) []spec.Op { + var ops []spec.Op + if webhook.GroupName(pod) != group { + if pod.Labels == nil { + ops = append(ops, spec.Op{Op: "add", Path: "/metadata/labels", + Value: map[string]string{webhook.GroupLabel: group}}) + } else { + ops = append(ops, spec.Op{Op: "add", + Path: "/metadata/labels/" + escapeJSONPointer(webhook.GroupLabel), + Value: group}) + } + } + if pod.Spec.SchedulingGroup == nil || pod.Spec.SchedulingGroup.PodGroupName == nil || + *pod.Spec.SchedulingGroup.PodGroupName != group { + ops = append(ops, spec.Op{Op: "add", Path: "/spec/schedulingGroup", + Value: map[string]string{"podGroupName": group}}) + } + return ops +} - log.Printf("[fluence-webhook] quantum pod %s/%s — interceptor (leader=%v role=%q observe=%v)", - pod.Namespace, pod.Name, isLeader, role, observe) +// escapeJSONPointer escapes "~" and "/" for use in a JSON Pointer path segment. +func escapeJSONPointer(s string) string { + s = strings.ReplaceAll(s, "~", "~0") + s = strings.ReplaceAll(s, "/", "~1") + return s +} - ops := m.InterceptorOps(pod) - ops = append(ops, roleEnvOps(pod, RoleLeader)...) - if isLeader || observe { - m.EnsureSidecarRBAC(ctx, pod.Namespace) - ops = append(ops, m.SidecarContainerOps(pod, observe)...) +const QuantumClassicalPriorityClass = "fluence-quantum-classical" + +// ── faux-submit (worker submit dedup) ─────────────────────────────────────────── +// +// Quantum-specific, and delivered through the SAME Python interceptor as the +// submitter — not a second mechanism. The submitter's interceptor tags the +// submit; the worker's interceptor (same staged code) no-ops the submit. Which +// behavior runs is selected at runtime by FLUENCE_FAUX_SUBMIT, set here on the +// worker. Workers run the submitter's image and may call submit, but by ungate +// time the task already exists, so resubmitting would duplicate it N times. + +const ( + // FauxSubmitEnv selects the interceptor's no-op (faux) mode on workers. 
+ // install_interceptor (see python/fluence/providers/braket.py) reads it and + // patches the vendor submit to return the existing task instead of submitting. + FauxSubmitEnv = "FLUENCE_FAUX_SUBMIT" + + // QuantumJobIDAnnotation is the vendor-neutral task id the ungating sidecar + // stamps on each worker (mirrors python/fluence/ungate.py JOB_ID_ANNOTATION), + // BEFORE removing the gate. Surfaced into FLUENCE_QUANTUM_JOB_ID via the + // downward API so the faux interceptor can return a handle to that task. + QuantumJobIDAnnotation = "fluence.flux-framework.org/quantum-job-id" + + // QuantumJobIDEnv is the env the faux interceptor reads for the existing + // task's id. + QuantumJobIDEnv = "FLUENCE_QUANTUM_JOB_ID" +) + +// fauxSubmitEnvOps sets, on each non-sidecar worker container, the faux-mode +// marker (FLUENCE_FAUX_SUBMIT=true) and the existing task's id +// (FLUENCE_QUANTUM_JOB_ID, downward API from the annotation the ungating sidecar +// stamps). The interceptor is staged separately via the shared sidecar +// InterceptorOps path — these env vars only switch its mode and hand it the id. 
+func fauxSubmitEnvOps(pod *corev1.Pod) []spec.Op { + faux := corev1.EnvVar{Name: FauxSubmitEnv, Value: "true"} + jobID := spec.AnnotationEnv(QuantumJobIDEnv, QuantumJobIDAnnotation) + var ops []spec.Op + for i, c := range pod.Spec.Containers { + if c.Name == SidecarContainerName { + continue + } + if !spec.HasEnv(c, FauxSubmitEnv) { + if len(c.Env) == 0 { + ops = append(ops, spec.Op{Op: "add", Path: fmt.Sprintf("/spec/containers/%d/env", i), Value: []corev1.EnvVar{faux}}) + pod.Spec.Containers[i].Env = []corev1.EnvVar{faux} + } else { + ops = append(ops, spec.Op{Op: "add", Path: fmt.Sprintf("/spec/containers/%d/env/-", i), Value: faux}) + pod.Spec.Containers[i].Env = append(pod.Spec.Containers[i].Env, faux) + } + } + if !spec.HasEnv(c, QuantumJobIDEnv) { + ops = append(ops, spec.Op{Op: "add", Path: fmt.Sprintf("/spec/containers/%d/env/-", i), Value: jobID}) + pod.Spec.Containers[i].Env = append(pod.Spec.Containers[i].Env, jobID) + } } return ops } -// roleEnvOps injects FLUENCE_ROLE into every (non-sidecar) container so the -// application reads its gang role from the same source of truth the webhook -// used. effectiveRole is what the webhook decided (leader/worker), used only -// when the pod carries no explicit role annotation; when the annotation is -// present we source the value from it via the downward API so the two always -// agree. Unlike InterceptorOps, this is NOT limited to Fluxion-resource -// containers — worker containers do not request the quantum resource but still -// need to know they are workers. -func roleEnvOps(pod *corev1.Pod, effectiveRole string) []spec.Op { - var value corev1.EnvVar - if webhook.Role(pod) != "" { - value = spec.AnnotationEnv("FLUENCE_ROLE", webhook.RoleAnnotation) - } else { - value = corev1.EnvVar{Name: "FLUENCE_ROLE", Value: effectiveRole} +// Sidecar implementation — quantum-owned, NOT core. 
+// +// The fluence coordination sidecar (its container, name, RBAC, image, and the +// Python interceptor staging) is specific to the quantum integration: it polls a +// vendor queue and ungates workers. None of this belongs on the webhook core, +// which stays domain-agnostic and only exposes generic primitives (Client, +// InjectedEnv, EnsurePodGroup). The core invokes each handler's generic Mutate; +// a handler does its own create/edit side-effects (here: RBAC, ConfigMaps, +// container injection) through the generic client. +// +// These are package-level functions (not methods on the core *Mutator) operating +// on the generic webhook.MutatorAPI. coreSidecar (see sidecar.go) delegates to +// them; a future non-quantum handler that needs a different sidecar supplies its +// own Sidecar implementation and its own container name/image. + +const ( + // SidecarContainerName is the injected sidecar container's name. Owned here + // (not a global core const) because the container is quantum-specific. + SidecarContainerName = "fluence-sidecar" + + // SidecarServiceAccount is the ServiceAccount (and Role/RoleBinding) name the + // sidecar uses to patch pods and read PodGroups. + SidecarServiceAccount = "fluence-sidecar" + + // defaultSidecarImage is used when FLUENCE_SIDECAR_IMAGE is not set. Owned by + // the quantum integration; the deployment may override it via the env var. + defaultSidecarImage = "ghcr.io/converged-computing/fluence-sidecar:latest" + + // StageVolumeName / StageMountPath: the shared emptyDir the init container + // stages the fluence Python package into, mounted into workload containers + // and prepended to PYTHONPATH (Model C delivery). + StageVolumeName = "fluence-pkg" + StageMountPath = "/opt/fluence-staged" +) + +// sidecarImage resolves the sidecar image: the FLUENCE_SIDECAR_IMAGE override +// (deployment config) or the quantum default. Read here so image config is owned +// by the integration that uses it, not the core. 
+func sidecarImage() string { + if v := os.Getenv("FLUENCE_SIDECAR_IMAGE"); v != "" { + return v + } + return defaultSidecarImage +} + +// ensureSidecarRBAC provisions the per-namespace ServiceAccount/Role/RoleBinding +// the sidecar uses to patch pods and read PodGroups. Idempotent (create-if-absent). +func ensureSidecarRBAC(ctx context.Context, m webhook.MutatorAPI, namespace string) { + c := m.Client() + if c == nil { + return + } + lbl := map[string]string{"app": SidecarServiceAccount} + + if _, err := c.CoreV1().ServiceAccounts(namespace).Get(ctx, SidecarServiceAccount, metav1.GetOptions{}); err != nil { + sa := &corev1.ServiceAccount{ObjectMeta: metav1.ObjectMeta{Name: SidecarServiceAccount, Namespace: namespace, Labels: lbl}} + if _, err := c.CoreV1().ServiceAccounts(namespace).Create(ctx, sa, metav1.CreateOptions{}); err != nil { + log.Printf("[fluence-webhook] could not create ServiceAccount %s/%s: %v", namespace, SidecarServiceAccount, err) + } + } + if _, err := c.RbacV1().Roles(namespace).Get(ctx, SidecarServiceAccount, metav1.GetOptions{}); err != nil { + role := &rbacv1.Role{ + ObjectMeta: metav1.ObjectMeta{Name: SidecarServiceAccount, Namespace: namespace, Labels: lbl}, + Rules: []rbacv1.PolicyRule{ + {APIGroups: []string{""}, Resources: []string{"pods"}, Verbs: []string{"get", "list", "patch", "update"}}, + {APIGroups: []string{"scheduling.k8s.io"}, Resources: []string{"podgroups"}, Verbs: []string{"get", "list"}}, + }, + } + if _, err := c.RbacV1().Roles(namespace).Create(ctx, role, metav1.CreateOptions{}); err != nil { + log.Printf("[fluence-webhook] could not create Role %s/%s: %v", namespace, SidecarServiceAccount, err) + } + } + if _, err := c.RbacV1().RoleBindings(namespace).Get(ctx, SidecarServiceAccount, metav1.GetOptions{}); err != nil { + rb := &rbacv1.RoleBinding{ + ObjectMeta: metav1.ObjectMeta{Name: SidecarServiceAccount, Namespace: namespace, Labels: lbl}, + Subjects: []rbacv1.Subject{{Kind: "ServiceAccount", Name: 
SidecarServiceAccount, Namespace: namespace}}, + RoleRef: rbacv1.RoleRef{APIGroup: "rbac.authorization.k8s.io", Kind: "Role", Name: SidecarServiceAccount}, + } + if _, err := c.RbacV1().RoleBindings(namespace).Create(ctx, rb, metav1.CreateOptions{}); err != nil { + log.Printf("[fluence-webhook] could not create RoleBinding %s/%s: %v", namespace, SidecarServiceAccount, err) + } } +} + +// interceptorOps stages the fluence Python package (Model C): an init container +// copies it into a shared emptyDir, mounted into every workload container +// (skipping the sidecar) with PYTHONPATH + FLUENCE_POD_UID, so Python auto-imports +// the interceptor via sitecustomize. Broad mounting is safe (fail-soft when the +// vendor SDK is absent) and is required so a quantum WORKER — which runs the same +// image but does not request the resource — also gets the (faux-mode) interceptor. +func interceptorOps(pod *corev1.Pod) []spec.Op { var ops []spec.Op + + vol := corev1.Volume{Name: StageVolumeName, VolumeSource: corev1.VolumeSource{EmptyDir: &corev1.EmptyDirVolumeSource{}}} + if len(pod.Spec.Volumes) == 0 { + ops = append(ops, spec.Op{Op: "add", Path: "/spec/volumes", Value: []corev1.Volume{vol}}) + } else { + ops = append(ops, spec.Op{Op: "add", Path: "/spec/volumes/-", Value: vol}) + } + + initc := corev1.Container{ + Name: "fluence-stage", + Image: sidecarImage(), + ImagePullPolicy: corev1.PullAlways, + Command: []string{"sh", "-c", + fmt.Sprintf("python -m fluence.stage %s || echo '[fluence] staging skipped (interceptor unavailable)'", StageMountPath)}, + VolumeMounts: []corev1.VolumeMount{{Name: StageVolumeName, MountPath: StageMountPath}}, + } + if len(pod.Spec.InitContainers) == 0 { + ops = append(ops, spec.Op{Op: "add", Path: "/spec/initContainers", Value: []corev1.Container{initc}}) + } else { + ops = append(ops, spec.Op{Op: "add", Path: "/spec/initContainers/-", Value: initc}) + } + + mount := corev1.VolumeMount{Name: StageVolumeName, MountPath: StageMountPath, ReadOnly: 
true} + pythonpath := corev1.EnvVar{Name: "PYTHONPATH", Value: StageMountPath} + uid := spec.FieldEnv("FLUENCE_POD_UID", "metadata.uid") for i, c := range pod.Spec.Containers { - if c.Name == "fluence-sidecar" || spec.HasEnv(c, "FLUENCE_ROLE") { + if c.Name == SidecarContainerName { continue } - if len(c.Env) == 0 { - ops = append(ops, spec.Op{Op: "add", Path: fmt.Sprintf("/spec/containers/%d/env", i), Value: []corev1.EnvVar{value}}) + if len(c.VolumeMounts) == 0 { + ops = append(ops, spec.Op{Op: "add", Path: fmt.Sprintf("/spec/containers/%d/volumeMounts", i), Value: []corev1.VolumeMount{mount}}) } else { - ops = append(ops, spec.Op{Op: "add", Path: fmt.Sprintf("/spec/containers/%d/env/-", i), Value: value}) + ops = append(ops, spec.Op{Op: "add", Path: fmt.Sprintf("/spec/containers/%d/volumeMounts/-", i), Value: mount}) + } + if !spec.HasEnv(c, "PYTHONPATH") { + if len(c.Env) == 0 { + ops = append(ops, spec.Op{Op: "add", Path: fmt.Sprintf("/spec/containers/%d/env", i), Value: []corev1.EnvVar{pythonpath}}) + pod.Spec.Containers[i].Env = []corev1.EnvVar{pythonpath} + } else { + ops = append(ops, spec.Op{Op: "add", Path: fmt.Sprintf("/spec/containers/%d/env/-", i), Value: pythonpath}) + pod.Spec.Containers[i].Env = append(pod.Spec.Containers[i].Env, pythonpath) + } + } + if !spec.HasEnv(c, "FLUENCE_POD_UID") { + ops = append(ops, spec.Op{Op: "add", Path: fmt.Sprintf("/spec/containers/%d/env/-", i), Value: uid}) + pod.Spec.Containers[i].Env = append(pod.Spec.Containers[i].Env, uid) } - pod.Spec.Containers[i].Env = append(pod.Spec.Containers[i].Env, value) } return ops } -// gateOps adds the quantum scheduling gate (idempotent). 
-const QuantumClassicalPriorityClass = "fluence-quantum-classical" - -func gateOps(pod *corev1.Pod) []spec.Op { - for _, g := range pod.Spec.SchedulingGates { - if g.Name == QuantumGate { - return nil +// sidecarContainerOps adds the fluence sidecar container (pod identity env, the +// generic FLUXION_* contract from InjectedEnv, the observe flag, handler-supplied +// extraEnv, and the workload's secret/configMap-sourced credentials) and sets the +// sidecar ServiceAccount. observe=true selects observe-only telemetry mode. +func sidecarContainerOps(m webhook.MutatorAPI, pod *corev1.Pod, observe bool, extraEnv []corev1.EnvVar) []spec.Op { + var ops []spec.Op + env := []corev1.EnvVar{ + spec.FieldEnv("FLUENCE_POD_UID", "metadata.uid"), + spec.FieldEnv("FLUENCE_POD_NAME", "metadata.name"), + spec.FieldEnv("FLUENCE_NAMESPACE", "metadata.namespace"), + spec.FieldEnv("FLUENCE_GROUP", "metadata.labels['"+webhook.GroupLabel+"']"), + } + env = append(env, m.InjectedEnv()...) + if observe { + env = append(env, corev1.EnvVar{Name: "FLUENCE_OBSERVE", Value: "true"}) + } + env = append(env, extraEnv...) + // Copy the workload container's secret/configMap-sourced env onto the sidecar + // so it can talk to the same backend (domain-agnostic: we propagate whatever + // the workload pulls from a secret/configMap; existing FLUENCE_/FLUXION_ names + // are not overwritten). 
+ if len(pod.Spec.Containers) > 0 { + have := map[string]bool{} + for _, e := range env { + have[e.Name] = true + } + for _, e := range pod.Spec.Containers[0].Env { + if have[e.Name] || e.ValueFrom == nil { + continue + } + if e.ValueFrom.SecretKeyRef != nil || e.ValueFrom.ConfigMapKeyRef != nil { + env = append(env, e) + } } } - var ops []spec.Op - gate := corev1.PodSchedulingGate{Name: QuantumGate} - if len(pod.Spec.SchedulingGates) == 0 { - ops = append(ops, spec.Op{Op: "add", Path: "/spec/schedulingGates", Value: []corev1.PodSchedulingGate{gate}}) + sidecar := corev1.Container{ + Name: SidecarContainerName, Image: sidecarImage(), ImagePullPolicy: corev1.PullAlways, + Env: env, + Resources: corev1.ResourceRequirements{Requests: corev1.ResourceList{ + corev1.ResourceCPU: resource.MustParse("100m"), corev1.ResourceMemory: resource.MustParse("256Mi"), + }}, + } + if len(pod.Spec.Containers) == 0 { + ops = append(ops, spec.Op{Op: "add", Path: "/spec/containers", Value: []corev1.Container{sidecar}}) } else { - ops = append(ops, spec.Op{Op: "add", Path: "/spec/schedulingGates/-", Value: gate}) - } - // Give gated classical workers a raised priority so they schedule reliably - // once ungated. priorityClassName is immutable post-creation, so it MUST be - // set here at admission, not at ungate time. Only set it if the pod doesn't - // already declare one (don't overwrite a user's class). - if pod.Spec.PriorityClassName == "" { - ops = append(ops, spec.Op{Op: "add", Path: "/spec/priorityClassName", Value: QuantumClassicalPriorityClass}) - // Clear spec.priority so the priority admission controller recomputes it - // from the class. The controller errors only when spec.priority is - // non-nil AND differs from the class value; setting it to null avoids - // that in every case. 
We use add-with-null (not remove): a JSON Patch - // "remove" of an absent path is a hard error, and whether the API has - // already defaulted spec.priority differs across clusters/k8s versions - // (it broke in CI but not on GKE, or vice versa). add-null is valid - // whether the field is absent, 0, or set. - ops = append(ops, spec.Op{Op: "add", Path: "/spec/priority", EmitNull: true}) + ops = append(ops, spec.Op{Op: "add", Path: "/spec/containers/-", Value: sidecar}) + } + if pod.Spec.ServiceAccountName == "" || pod.Spec.ServiceAccountName == "default" { + ops = append(ops, spec.Op{Op: "add", Path: "/spec/serviceAccountName", Value: SidecarServiceAccount}) } return ops } diff --git a/pkg/webhook/handlers/quantum_test.go b/pkg/webhook/handlers/quantum_test.go new file mode 100644 index 0000000..613724d --- /dev/null +++ b/pkg/webhook/handlers/quantum_test.go @@ -0,0 +1,448 @@ +/* +Copyright 2024 Lawrence Livermore National Security, LLC + (c.f. AUTHORS, NOTICE.LLNS, COPYING) +SPDX-License-Identifier: Apache-2.0 +*/ + +// quantum_test.go — all tests for the quantum handler: the gang + submitter +// model, faux-submit, the sidecar wiring, the Dependency primitive, and the +// standalone/observe paths. Shared fixtures (qpuPod, cpuPod, op helpers) live in +// handlers_test.go. 
+package handlers + +import ( + "context" + "testing" + + "github.com/converged-computing/fluence/pkg/webhook" + "github.com/converged-computing/fluence/pkg/webhook/spec" + + batchv1 "k8s.io/api/batch/v1" + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/client-go/kubernetes/fake" +) + +// ── standalone / observe ──────────────────────────────────────────────────────── + +func TestSingleQuantumGetsInterceptorNoSidecar(t *testing.T) { + m := &webhook.Mutator{AttributeKeys: []string{"region"}} + ops := m.Mutate(context.Background(), qpuPod("fluence")) + names := opEnvNames(ops) + if !contains(names, "FLUXION_BACKEND") { + t.Errorf("want FLUXION_BACKEND, got %v", names) + } + if !contains(names, "PYTHONPATH") || !contains(names, "FLUENCE_POD_UID") { + t.Errorf("want interceptor env (PYTHONPATH, FLUENCE_POD_UID), got %v", names) + } + if hasSidecarOp(ops) { + t.Error("standalone quantum pod should not get a sidecar") + } + if hasGateOp(ops) { + t.Error("standalone quantum pod should not be gated") + } +} + +func TestObserveLabelInjectsSidecar(t *testing.T) { + m := &webhook.Mutator{} + pod := qpuPod("fluence") + pod.Labels = map[string]string{ObserveLabel: "true"} + ops := m.Mutate(context.Background(), pod) + if !hasSidecarOp(ops) { + t.Error("observe-labeled quantum pod should get the sidecar") + } + if hasGateOp(ops) { + t.Error("observe-only pod should not be gated") + } +} + +// ── gang + submitter ──────────────────────────────────────────────────────────── + +// gangQPUPod is a quantum workload pod (requests the resource) in a group, +// owned by a Job of parallelism N — the common real shape (a MiniCluster / +// indexed Job). No role annotation: the new model has no leader/worker. 
+func gangQPUPod(ns, group, name, job string) *corev1.Pod { + p := qpuPod("fluence") + p.Name = name + p.Namespace = ns + p.Labels = map[string]string{webhook.GroupLabel: group} + p.OwnerReferences = []metav1.OwnerReference{{Kind: "Job", Name: job}} + return p +} + +// mincount returns the gang minCount of the named PodGroup, or ok=false. +func mincount(t *testing.T, cs *fake.Clientset, ns, group string) (int32, bool) { + t.Helper() + pg, err := cs.SchedulingV1alpha2().PodGroups(ns).Get(context.Background(), group, metav1.GetOptions{}) + if err != nil || pg.Spec.SchedulingPolicy.Gang == nil { + return 0, false + } + return pg.Spec.SchedulingPolicy.Gang.MinCount, true +} + +// A quantum gang member (owned by Job parallelism=3) is gated + faux, its gang +// PodGroup is minCount 3 (full N — no N-1 split), and Fluence creates the +// separate -submitter pod. It gets NO sidecar (it is gated). +func TestQuantumGangGatedFauxAndSubmitterCreated(t *testing.T) { + ns, group, job := "default", "qg", "qg-job" + par := int32(3) + cs := fake.NewSimpleClientset(&batchv1.Job{ + ObjectMeta: metav1.ObjectMeta{Name: job, Namespace: ns}, + Spec: batchv1.JobSpec{Parallelism: &par, Completions: &par}}) + m := &webhook.Mutator{Clientset: cs} + + ops := m.Mutate(context.Background(), gangQPUPod(ns, group, "qg-0", job)) + + if !hasGateOp(ops) { + t.Error("gang member must be gated") + } + if hasSidecarOp(ops) { + t.Error("gang member (gated) must NOT get a sidecar") + } + if e, ok := envOp(ops, FauxSubmitEnv); !ok || e.Value != "true" { + t.Errorf("gang member must get %s=true", FauxSubmitEnv) + } + if mc, ok := mincount(t, cs, ns, group); !ok || mc != 3 { + t.Errorf("gang PodGroup minCount=%d (ok=%v), want 3 (full N, no split)", mc, ok) + } + // No -workers subgroup in the new model. + if _, ok := mincount(t, cs, ns, group+"-workers"); ok { + t.Error("there must be no -workers subgroup in the gang+submitter model") + } + // Fluence created the submitter. 
+ sub, err := cs.CoreV1().Pods(ns).Get(context.Background(), group+SubmitterGroupSuffix, metav1.GetOptions{}) + if err != nil { + t.Fatalf("submitter pod not created: %v", err) + } + if sub.Annotations[SubmitterAnnotation] != "true" { + t.Error("submitter must carry the submitter marker") + } + if sub.Annotations[GangGroupAnnotation] != group { + t.Errorf("submitter gang-group=%q, want %q", sub.Annotations[GangGroupAnnotation], group) + } + if len(sub.Spec.SchedulingGates) != 0 { + t.Error("submitter must NOT be gated") + } +} + +// The submitter pod, on its own admission, is wired as the real coordinator: its +// own PodGroup minCount 1, the real sidecar (not faux), not gated, and told which +// gang to ungate via FLUENCE_GANG_GROUP. +func TestSubmitterWiredAsRealSidecar(t *testing.T) { + ns, group, job := "default", "qg2", "qg2-job" + par := int32(2) + cs := fake.NewSimpleClientset(&batchv1.Job{ + ObjectMeta: metav1.ObjectMeta{Name: job, Namespace: ns}, + Spec: batchv1.JobSpec{Parallelism: &par, Completions: &par}}) + m := &webhook.Mutator{Clientset: cs} + + // First a gang member, which creates the submitter. + m.Mutate(context.Background(), gangQPUPod(ns, group, "qg2-0", job)) + sub, err := cs.CoreV1().Pods(ns).Get(context.Background(), group+SubmitterGroupSuffix, metav1.GetOptions{}) + if err != nil { + t.Fatalf("submitter not created: %v", err) + } + + ops := m.Mutate(context.Background(), sub) + if !hasSidecarOp(ops) { + t.Error("submitter must get the real sidecar") + } + if hasGateOp(ops) { + t.Error("submitter must NOT be gated") + } + if _, ok := envOp(ops, FauxSubmitEnv); ok { + t.Error("submitter must NOT be in faux mode") + } + // FLUENCE_GANG_GROUP is on the sidecar container itself. 
+ var sidecar *corev1.Container + for _, op := range ops { + if c, ok := op.Value.(corev1.Container); ok && c.Name == SidecarContainerName { + cc := c + sidecar = &cc + } + } + if sidecar == nil { + t.Fatal("no sidecar container on submitter") + } + var gotGang bool + for _, e := range sidecar.Env { + if e.Name == GangGroupEnv && e.Value == group { + gotGang = true + } + } + if !gotGang { + t.Errorf("submitter sidecar must get %s=%q", GangGroupEnv, group) + } + if mc, ok := mincount(t, cs, ns, group+SubmitterGroupSuffix); !ok || mc != 1 { + t.Errorf("submitter PodGroup minCount=%d (ok=%v), want 1", mc, ok) + } +} + +// A standalone quantum pod (no group, no owner → group of one) does its own real +// submit: interceptor staged, but no gating, no faux, and no separate submitter. +func TestStandaloneQuantumIsRealNoSubmitter(t *testing.T) { + ns := "default" + cs := fake.NewSimpleClientset() + m := &webhook.Mutator{Clientset: cs} + + pod := qpuPod("fluence") + pod.Name = "solo" + pod.Namespace = ns + + ops := m.Mutate(context.Background(), pod) + if hasGateOp(ops) { + t.Error("standalone quantum pod must not be gated") + } + if _, ok := envOp(ops, FauxSubmitEnv); ok { + t.Error("standalone quantum pod must not be faux") + } + pods, _ := cs.CoreV1().Pods(ns).List(context.Background(), metav1.ListOptions{}) + if len(pods.Items) != 0 { + t.Error("standalone quantum pod must not spawn a submitter") + } +} + +// ── faux-submit + dependency ──────────────────────────────────────────────────── + +// envOp returns the env var carried by the ops with the given name, if present (covers +// both single-EnvVar and []EnvVar op shapes). 
+func envOp(ops []spec.Op, name string) (corev1.EnvVar, bool) { + for _, op := range ops { + switch v := op.Value.(type) { + case corev1.EnvVar: + if v.Name == name { + return v, true + } + case []corev1.EnvVar: + for _, e := range v { + if e.Name == name { + return e, true + } + } + } + } + return corev1.EnvVar{}, false +} + +// annotationOps collects all annotation key=value pairs the ops would stamp. +func annotationOps(ops []spec.Op) map[string]string { + out := map[string]string{} + for _, op := range ops { + // whole-map add: /metadata/annotations + if op.Path == "/metadata/annotations" { + if m, ok := op.Value.(map[string]string); ok { + for k, v := range m { + out[k] = v + } + } + continue + } + // single-key add: /metadata/annotations/{escaped-key} -> string value + const pfx = "/metadata/annotations/" + if len(op.Path) > len(pfx) && op.Path[:len(pfx)] == pfx { + if s, ok := op.Value.(string); ok { + key := unescapeJSONPointer(op.Path[len(pfx):]) + out[key] = s + } + } + } + return out +} + +// unescapeJSONPointer reverses escapeJSONPointer for assertion readability. +func unescapeJSONPointer(s string) string { + // reverse order of escape: ~1 -> /, then ~0 -> ~ + out := "" + for i := 0; i < len(s); i++ { + if s[i] == '~' && i+1 < len(s) { + switch s[i+1] { + case '1': + out += "/" + i++ + continue + case '0': + out += "~" + i++ + continue + } + } + out += string(s[i]) + } + return out +} + +// A quantum worker (no group-size of its own) is expressed as a general +// Dependency: gated, stamped with depends-on-{kind,producer,gate}, and the +// producer is the submitter group (base group + SubmitterGroupSuffix). 
+func TestQuantumWorkerIsGeneralDependency(t *testing.T) { + ns, group, job := "default", "depq", "depq-job" + par := int32(3) + cs := fake.NewSimpleClientset(&batchv1.Job{ + ObjectMeta: metav1.ObjectMeta{Name: job, Namespace: ns}, + Spec: batchv1.JobSpec{Parallelism: &par, Completions: &par}}) + m := &webhook.Mutator{Clientset: cs} + + ops := m.Mutate(context.Background(), gangQPUPod(ns, group, "depq-0", job)) + + if !hasGateOp(ops) { + t.Errorf("worker not gated by the dependency (ops: %+v)", ops) + } + ann := annotationOps(ops) + if ann[DependsOnKindAnnotation] != DependencyKindQuantumSubmit { + t.Errorf("depends-on-kind=%q, want %q", ann[DependsOnKindAnnotation], DependencyKindQuantumSubmit) + } + if ann[DependsOnProducerAnnotation] != group+SubmitterGroupSuffix { + t.Errorf("depends-on-producer=%q, want %q (the submitter group)", ann[DependsOnProducerAnnotation], group+SubmitterGroupSuffix) + } + if ann[DependsOnGateAnnotation] != QuantumGate { + t.Errorf("depends-on-gate=%q, want %q", ann[DependsOnGateAnnotation], QuantumGate) + } +} + +// DependencyOf round-trips the stamped annotations back into a Dependency, so a +// scheduler/sidecar observer can read what a gated pod waits for. 
+func TestDependencyOfRoundTrip(t *testing.T) { + pod := &corev1.Pod{ObjectMeta: metav1.ObjectMeta{Annotations: map[string]string{ + DependsOnKindAnnotation: DependencyKindQuantumSubmit, + DependsOnProducerAnnotation: "grp", + DependsOnGateAnnotation: QuantumGate, + }}} + d, ok := DependencyOf(pod) + if !ok || d.Kind != DependencyKindQuantumSubmit || d.Producer != "grp" || d.Gate != QuantumGate { + t.Errorf("DependencyOf=%+v ok=%v, want quantum-submit/grp/%s", d, ok, QuantumGate) + } + if _, ok := DependencyOf(&corev1.Pod{}); ok { + t.Errorf("DependencyOf on a pod with no dependency should be ok=false") + } +} + +// The worker is staged with the SAME interceptor as the submitter (PYTHONPATH + +// FLUENCE_POD_UID), put into faux mode (FLUENCE_FAUX_SUBMIT=true), and handed the +// existing task id via the FLUENCE_QUANTUM_JOB_ID downward-API env. One +// mechanism, two modes — no separate ConfigMap shim. The user sets nothing. +func TestQuantumWorkerStagedWithFauxSubmit(t *testing.T) { + ns, group, job := "default", "fauxq", "fauxq-job" + par := int32(2) + cs := fake.NewSimpleClientset(&batchv1.Job{ + ObjectMeta: metav1.ObjectMeta{Name: job, Namespace: ns}, + Spec: batchv1.JobSpec{Parallelism: &par, Completions: &par}}) + m := &webhook.Mutator{Clientset: cs} + + ops := m.Mutate(context.Background(), gangQPUPod(ns, group, "fauxq-0", job)) + + // Same interceptor staging as the submitter (PYTHONPATH set on the worker). + if _, ok := envOp(ops, "PYTHONPATH"); !ok { + t.Errorf("worker not staged with the interceptor (no PYTHONPATH); ops: %+v", ops) + } + + // Faux mode selected. + if e, ok := envOp(ops, FauxSubmitEnv); !ok || e.Value != "true" { + t.Errorf("worker missing %s=true (got %+v, ok=%v)", FauxSubmitEnv, e, ok) + } + + // Existing task id sourced from the annotation the ungating sidecar stamps. 
+ e, ok := envOp(ops, QuantumJobIDEnv) + if !ok { + t.Fatalf("worker missing %s env", QuantumJobIDEnv) + } + if e.ValueFrom == nil || e.ValueFrom.FieldRef == nil || + e.ValueFrom.FieldRef.FieldPath != "metadata.annotations['"+QuantumJobIDAnnotation+"']" { + t.Errorf("%s should be a downward-API ref to %s, got %+v", QuantumJobIDEnv, QuantumJobIDAnnotation, e) + } +} + +// Classical override below the replica count: group-size=2 on a gang owned by a +// Job(parallelism=5) must yield minCount=2 (the override), not 5. With a cluster +// sized to 2, the gang reaches quorum and runs; if the override were dropped the +// gang would wait forever for 5 (the e2e hang that fails CI). +func TestClassicalOverrideBelowReplicaCount(t *testing.T) { + ns, group, job := "default", "ovr2", "ovr2-job" + pod := cpuPod("fluence") + pod.Namespace = ns + pod.Labels = map[string]string{webhook.GroupLabel: group} + pod.Annotations = map[string]string{webhook.GroupSizeAnnotation: "2"} + ownedBy(pod, "Job", job) + + got := minCountWithClient(t, pod, jobWithParallelism(ns, job, 5)) + if got != 2 { + t.Errorf("override below replicas: minCount=%d, want 2 (override wins over Job=5)", got) + } +} + +// ── sidecar wiring ────────────────────────────────────────────────────────────── + +// The sidecar inherits the workload's secret/configMap-sourced credentials so it +// can talk to the same backend, but NOT plain-value env. (Moved from the core +// webhook package: sidecar construction is now quantum-owned.) 
+func TestSidecarInheritsWorkloadSecretEnv(t *testing.T) { + m := &webhook.Mutator{Clientset: fake.NewSimpleClientset()} + pod := &corev1.Pod{ + Spec: corev1.PodSpec{ + Containers: []corev1.Container{{ + Name: "gang", + Env: []corev1.EnvVar{ + {Name: "GANG_ROLE", Value: "leader"}, // plain value: NOT copied + {Name: "AWS_ACCESS_KEY_ID", ValueFrom: &corev1.EnvVarSource{ + SecretKeyRef: &corev1.SecretKeySelector{ + LocalObjectReference: corev1.LocalObjectReference{Name: "aws-braket-credentials"}, + Key: "AWS_ACCESS_KEY_ID", + }}}, + }, + }}, + }, + } + ops := sidecarContainerOps(m, pod, false, nil) + var sidecar *corev1.Container + for _, op := range ops { + if c, ok := op.Value.(corev1.Container); ok && c.Name == SidecarContainerName { + sidecar = &c + } + } + if sidecar == nil { + t.Fatal("no sidecar container added") + } + var gotSecret, gotPlain bool + for _, e := range sidecar.Env { + if e.Name == "AWS_ACCESS_KEY_ID" && e.ValueFrom != nil && e.ValueFrom.SecretKeyRef != nil { + gotSecret = true + } + if e.Name == "GANG_ROLE" { + gotPlain = true + } + } + if !gotSecret { + t.Error("sidecar should inherit the workload's secret-sourced AWS creds") + } + if gotPlain { + t.Error("sidecar should NOT copy plain-value workload env like GANG_ROLE") + } +} + +// A plain quantum workload pod (no role, owned by a Job of N>1) is gated as a +// faux gang member AND triggers creation of the one-off submitter. The user +// authors no submitter and no roles. 
+func TestGangMemberTriggersSubmitter(t *testing.T) { + ns, group, job := "default", "qauto", "qauto-job" + par := int32(2) + cs := fake.NewSimpleClientset(&batchv1.Job{ + ObjectMeta: metav1.ObjectMeta{Name: job, Namespace: ns}, + Spec: batchv1.JobSpec{Parallelism: &par, Completions: &par}}) + m := &webhook.Mutator{Clientset: cs} + + workload := gangQPUPod(ns, group, "qauto-0", job) + ops := m.Mutate(context.Background(), workload) + + if !hasGateOp(ops) { + t.Error("gang member must be gated") + } + if _, ok := envOp(ops, FauxSubmitEnv); !ok { + t.Error("gang member must get FLUENCE_FAUX_SUBMIT") + } + sub, err := cs.CoreV1().Pods(ns).Get(context.Background(), group+SubmitterGroupSuffix, metav1.GetOptions{}) + if err != nil { + t.Fatalf("submitter pod not created: %v", err) + } + if !spec.PodRequestsResource(sub, QuantumResource) { + t.Error("submitter must request the quantum resource (it runs the real submit)") + } +} diff --git a/pkg/webhook/handlers/registry_test.go b/pkg/webhook/handlers/registry_test.go new file mode 100644 index 0000000..346d786 --- /dev/null +++ b/pkg/webhook/handlers/registry_test.go @@ -0,0 +1,82 @@ +/* +Copyright 2024 Lawrence Livermore National Security, LLC + (c.f. AUTHORS, NOTICE.LLNS, COPYING) +SPDX-License-Identifier: Apache-2.0 +*/ + +// Registry behavior: dispatch order comes from the active handler list (not a +// per-handler Order), and the list both selects and orders handlers. +package handlers + +import ( + "context" + "testing" + + "github.com/converged-computing/fluence/pkg/webhook" + "github.com/converged-computing/fluence/pkg/webhook/spec" + + "k8s.io/client-go/kubernetes/fake" +) + +// The default active order ships gang LAST so it only applies default gang +// sizing when no earlier handler shaped the gang. 
+func TestDefaultOrderGangLast(t *testing.T) { + defer webhook.SetActiveHandlers(nil) + active, _ := webhook.SetActiveHandlers(nil) // restore + read default + if len(active) == 0 { + t.Fatal("no active handlers") + } + if active[len(active)-1] != "gang" { + t.Errorf("gang must be last in default order; got %v", active) + } + // default order is exactly fluxion, quantum, gang + want := []string{"fluxion", "quantum", "gang"} + if len(active) != len(want) { + t.Fatalf("default order = %v, want %v", active, want) + } + for i := range want { + if active[i] != want[i] { + t.Errorf("default order = %v, want %v", active, want) + break + } + } +} + +// The active list IS the order: passing a custom order reorders dispatch, and +// unknown names are reported, not silently kept. +func TestActiveListSetsOrderAndReportsUnknown(t *testing.T) { + defer webhook.SetActiveHandlers(nil) + active, unknown := webhook.SetActiveHandlers([]string{"gang", "fluxion", "bogus"}) + if len(active) != 2 || active[0] != "gang" || active[1] != "fluxion" { + t.Errorf("active = %v, want [gang fluxion] in that order", active) + } + if len(unknown) != 1 || unknown[0] != "bogus" { + t.Errorf("unknown = %v, want [bogus]", unknown) + } +} + +// Dropping a handler from the list disables it: a quantum pod with quantum +// omitted gets no interceptor ops (only fluxion/gang act). 
+func TestOmittedHandlerDoesNotDispatch(t *testing.T) { + defer webhook.SetActiveHandlers(nil) + m := &webhook.Mutator{Clientset: fake.NewSimpleClientset()} + + webhook.SetActiveHandlers(nil) // default: quantum present + if !hasInterceptor(m.Mutate(context.Background(), qpuPod("fluence"))) { + t.Fatal("with quantum active, expected interceptor (init container) ops") + } + + webhook.SetActiveHandlers([]string{"fluxion", "gang"}) // quantum omitted + if hasInterceptor(m.Mutate(context.Background(), qpuPod("fluence"))) { + t.Error("with quantum omitted, interceptor ops must NOT be present") + } +} + +func hasInterceptor(ops []spec.Op) bool { + for _, op := range ops { + if op.Path == "/spec/initContainers" || op.Path == "/spec/initContainers/-" { + return true + } + } + return false +} diff --git a/pkg/webhook/handlers/sidecar.go b/pkg/webhook/handlers/sidecar.go new file mode 100644 index 0000000..d105a7c --- /dev/null +++ b/pkg/webhook/handlers/sidecar.go @@ -0,0 +1,57 @@ +package handlers + +import ( + "context" + + "github.com/converged-computing/fluence/pkg/webhook" + "github.com/converged-computing/fluence/pkg/webhook/spec" + + corev1 "k8s.io/api/core/v1" +) + +// Sidecar is the capability a handler uses to attach a coordination sidecar to a +// pod. It is NOT part of the webhook core's MutatorAPI: only handlers that need +// a sidecar (today, quantum) depend on it, and a handler may supply its own +// implementation to customize delivery. The default implementation +// (coreSidecar) delegates to this package's interceptor/sidecar ops, which +// remain the staging mechanism shared by any sidecar-using handler. +// +// This is the seam the design calls for: "a general sidecar interface that can +// be used across handlers and customized by the quantum [handler]". A future +// custom-resource handler can implement Sidecar differently (different image, +// env, gating) without touching the core or other handlers. 
+type Sidecar interface { + // EnsureRBAC provisions the per-namespace ServiceAccount/Role/Binding the + // sidecar needs to read/patch pods and podgroups. + EnsureRBAC(ctx context.Context, namespace string) + // InterceptorOps stages the in-pod interceptor (Model C) into the workload + // containers (init container + shared volume on PYTHONPATH). + InterceptorOps(pod *corev1.Pod) []spec.Op + // ContainerOps adds the sidecar container. observe=true selects observe-only + // telemetry mode (no ungating). extraEnv carries handler-computed, + // domain-specific env (e.g. the quantum handler's FLUENCE_EXPECTED_WORKERS = + // N-1 and FLUENCE_WORKER_GROUP_BASE) so the core never has to know about + // leader/worker concepts — the handler that owns the split owns those values. + ContainerOps(pod *corev1.Pod, observe bool, extraEnv []corev1.EnvVar) []spec.Op +} + +// coreSidecar is the default Sidecar. It delegates to the quantum-owned sidecar +// implementation (see sidecar_impl.go), which uses only the generic MutatorAPI +// (Client, InjectedEnv). The webhook core no longer carries any sidecar logic; a +// custom handler could supply its own Sidecar with a different container/image. +type coreSidecar struct{ m webhook.MutatorAPI } + +func (s coreSidecar) EnsureRBAC(ctx context.Context, namespace string) { + ensureSidecarRBAC(ctx, s.m, namespace) +} +func (s coreSidecar) InterceptorOps(pod *corev1.Pod) []spec.Op { + return interceptorOps(pod) +} +func (s coreSidecar) ContainerOps(pod *corev1.Pod, observe bool, extraEnv []corev1.EnvVar) []spec.Op { + return sidecarContainerOps(s.m, pod, observe, extraEnv) +} + +// sidecarFor returns the Sidecar a handler should use. Centralized so the choice +// of implementation (and any future per-handler customization) lives in one +// place. Today every sidecar-using handler gets the core-backed default. 
+func sidecarFor(m webhook.MutatorAPI) Sidecar { return coreSidecar{m: m} } diff --git a/pkg/webhook/webhook.go b/pkg/webhook/webhook.go index 20a7288..b39bec1 100644 --- a/pkg/webhook/webhook.go +++ b/pkg/webhook/webhook.go @@ -1,11 +1,11 @@ // Package webhook is fluence's mutating admission webhook. // // The core here is domain-agnostic plumbing: it owns the Mutator, the handler -// dispatcher, per-namespace PodGroup/RBAC provisioning, the Model C package -// staging (init container + shared volume on PYTHONPATH), the HTTP entrypoint, -// and self-managed TLS. It knows nothing about quantum, Braket, gate names, or -// observe labels — that policy lives entirely in the handlers (pkg/webhook/ -// handlers), which self-register via Register(). +// dispatcher, per-namespace PodGroup provisioning, the HTTP entrypoint, and +// self-managed TLS. It knows nothing about quantum, Braket, gate names, sidecars, +// RBAC, or interceptor staging — that policy and machinery lives entirely in the +// handlers (pkg/webhook/handlers), which self-register via Register() and perform +// their own create/edit side-effects through the generic MutatorAPI. // // The webhook self-manages TLS via a self-signed CA patched into the // MutatingWebhookConfiguration caBundle at startup. @@ -32,9 +32,7 @@ import ( admissionv1 "k8s.io/api/admission/v1" corev1 "k8s.io/api/core/v1" - rbacv1 "k8s.io/api/rbac/v1" schedulingv1alpha2 "k8s.io/api/scheduling/v1alpha2" - "k8s.io/apimachinery/pkg/api/resource" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/types" "k8s.io/client-go/kubernetes" @@ -52,34 +50,12 @@ const ( // meaning to it (a handler decides what a group means). GroupLabel = "fluence.flux-framework.org/group" - // LeaderAnnotation records the admission-order leader on a PodGroup. - LeaderAnnotation = "fluence.flux-framework.org/leader" - - // RoleAnnotation, set by the workload on each pod, explicitly declares the - // pod's gang role ("leader" or "worker"). 
When present it is AUTHORITATIVE: - // the quantum handler gates workers and gives the leader the sidecar based - // on this value, instead of inferring the leader by admission order. The - // same value is injected into the container env as FLUENCE_ROLE so the - // application reads its role from the same source of truth Fluence used. - // When absent, role falls back to admission order (backwards compatible). - RoleAnnotation = "fluence.flux-framework.org/role" - - // ExpectedWorkersAnnotation, set by the workload on the leader pod, tells the - // sidecar how many gated workers to wait for before ungating. The count is - // known at admission (the workload declares it) even though worker names are - // not, so it travels as a static sidecar env var. The core treats it as an - // opaque string and ascribes no meaning to it beyond propagation. - ExpectedWorkersAnnotation = "fluence.flux-framework.org/expected-workers" - - // Sidecar/staging infrastructure (generic — not quantum-specific). - SidecarImage = "ghcr.io/converged-computing/fluence-sidecar:latest" - SidecarServiceAccount = "fluence-sidecar" - - // StageVolumeName / StageMountPath: the shared emptyDir the init container - // stages the fluence Python package into, mounted into the user container and - // prepended to PYTHONPATH (Model C delivery). - StageVolumeName = "fluence-pkg" - StageMountPath = "/opt/fluence-staged" + // GroupSizeAnnotation is the gang member count N, set by the workload on each + // pod. It is the authoritative override for the PodGroup gang minCount when + // the size cannot (or should not) be derived from the owning controller — and + // for loose grouped pods where counting at admission is unreliable. The core + // treats it as an opaque integer string. 
+ GroupSizeAnnotation = "fluence.flux-framework.org/group-size" ) // ── Mutator ───────────────────────────────────────────────────────────────────── @@ -87,31 +63,14 @@ const ( type Mutator struct { AttributeKeys []string Clientset kubernetes.Interface - SidecarImage string } // compile-time check that *Mutator satisfies the handler capability interface. var _ MutatorAPI = (*Mutator)(nil) -func (m *Mutator) sidecarImage() string { - if m.SidecarImage != "" { - return m.SidecarImage - } - return SidecarImage -} - // GroupName returns the value of GroupLabel on the pod, or "". func GroupName(pod *corev1.Pod) string { return spec.Label(pod, GroupLabel) } -// Role returns the explicit gang role declared on the pod via RoleAnnotation -// ("leader"/"worker"), or "" if unset (caller falls back to admission order). -func Role(pod *corev1.Pod) string { return spec.Annotation(pod, RoleAnnotation) } - -func resourceQuantity(s string) *resource.Quantity { - q := resource.MustParse(s) - return &q -} - // ── MutatorAPI: capabilities exposed to handlers ──────────────────────────────── // Client implements MutatorAPI: returns the Kubernetes client (nil in tests). @@ -138,29 +97,13 @@ func (m *Mutator) EnvVarNames() []string { return names } -// PodGroupLeader returns the recorded admission-order leader for the group, or -// "". Retries briefly to absorb the concurrent leader/worker admission race. 
-func (m *Mutator) PodGroupLeader(ctx context.Context, namespace, group string) string { - if m.Clientset == nil || group == "" { - return "" - } - for i := 0; i < 3; i++ { - pg, err := m.Clientset.SchedulingV1alpha2().PodGroups(namespace).Get(ctx, group, metav1.GetOptions{}) - if err != nil { - return "" - } - if pg.Annotations != nil && pg.Annotations[LeaderAnnotation] != "" { - return pg.Annotations[LeaderAnnotation] - } - if i < 2 { - time.Sleep(100 * time.Millisecond) - } +// EnsurePodGroup creates a Fluence-owned PodGroup with gang minCount = the full +// gang size N (the whole group schedules atomically) if absent. minCount<=0 +// falls back to 1. +func (m *Mutator) EnsurePodGroup(ctx context.Context, namespace, group, leaderPod string, minCount int32) { + if minCount <= 0 { + minCount = 1 } - return "" -} - -// EnsurePodGroup creates a Fluence-owned PodGroup (minCount:1) if absent. -func (m *Mutator) EnsurePodGroup(ctx context.Context, namespace, group, leaderPod string) { if m.Clientset == nil { return } @@ -179,205 +122,17 @@ func (m *Mutator) EnsurePodGroup(ctx context.Context, namespace, group, leaderPo }, Spec: schedulingv1alpha2.PodGroupSpec{ SchedulingPolicy: schedulingv1alpha2.PodGroupSchedulingPolicy{ - Gang: &schedulingv1alpha2.GangSchedulingPolicy{MinCount: 1}, + Gang: &schedulingv1alpha2.GangSchedulingPolicy{MinCount: minCount}, }, }, } if _, err := m.Clientset.SchedulingV1alpha2().PodGroups(namespace).Create(ctx, pg, metav1.CreateOptions{}); err != nil { log.Printf("[fluence-webhook] could not create PodGroup %s/%s: %v", namespace, group, err) } else { - log.Printf("[fluence-webhook] created PodGroup %s/%s (minCount=1)", namespace, group) - } -} - -// RecordLeader records leaderPod as the group's admission-order leader. 
-func (m *Mutator) RecordLeader(ctx context.Context, namespace, group, leaderPod string) { - if m.Clientset == nil || group == "" { - return - } - patch := fmt.Sprintf(`{"metadata":{"annotations":{%q:%q}}}`, LeaderAnnotation, leaderPod) - if _, err := m.Clientset.SchedulingV1alpha2().PodGroups(namespace).Patch( - ctx, group, types.MergePatchType, []byte(patch), metav1.PatchOptions{}); err != nil { - log.Printf("[fluence-webhook] could not record leader on PodGroup %s/%s: %v", namespace, group, err) - } -} - -// EnsureSidecarRBAC provisions the per-namespace ServiceAccount/Role/RoleBinding -// the sidecar uses to patch pods and read PodGroups. -func (m *Mutator) EnsureSidecarRBAC(ctx context.Context, namespace string) { - if m.Clientset == nil { - return - } - lbl := map[string]string{"app": "fluence-sidecar"} - - if _, err := m.Clientset.CoreV1().ServiceAccounts(namespace).Get(ctx, SidecarServiceAccount, metav1.GetOptions{}); err != nil { - sa := &corev1.ServiceAccount{ObjectMeta: metav1.ObjectMeta{Name: SidecarServiceAccount, Namespace: namespace, Labels: lbl}} - if _, err := m.Clientset.CoreV1().ServiceAccounts(namespace).Create(ctx, sa, metav1.CreateOptions{}); err != nil { - log.Printf("[fluence-webhook] could not create ServiceAccount %s/%s: %v", namespace, SidecarServiceAccount, err) - } - } - if _, err := m.Clientset.RbacV1().Roles(namespace).Get(ctx, SidecarServiceAccount, metav1.GetOptions{}); err != nil { - role := &rbacv1.Role{ - ObjectMeta: metav1.ObjectMeta{Name: SidecarServiceAccount, Namespace: namespace, Labels: lbl}, - Rules: []rbacv1.PolicyRule{ - {APIGroups: []string{""}, Resources: []string{"pods"}, Verbs: []string{"get", "list", "patch", "update"}}, - {APIGroups: []string{"scheduling.k8s.io"}, Resources: []string{"podgroups"}, Verbs: []string{"get", "list"}}, - }, - } - if _, err := m.Clientset.RbacV1().Roles(namespace).Create(ctx, role, metav1.CreateOptions{}); err != nil { - log.Printf("[fluence-webhook] could not create Role %s/%s: %v", 
namespace, SidecarServiceAccount, err) - } - } - if _, err := m.Clientset.RbacV1().RoleBindings(namespace).Get(ctx, SidecarServiceAccount, metav1.GetOptions{}); err != nil { - rb := &rbacv1.RoleBinding{ - ObjectMeta: metav1.ObjectMeta{Name: SidecarServiceAccount, Namespace: namespace, Labels: lbl}, - Subjects: []rbacv1.Subject{{Kind: "ServiceAccount", Name: SidecarServiceAccount, Namespace: namespace}}, - RoleRef: rbacv1.RoleRef{APIGroup: "rbac.authorization.k8s.io", Kind: "Role", Name: SidecarServiceAccount}, - } - if _, err := m.Clientset.RbacV1().RoleBindings(namespace).Create(ctx, rb, metav1.CreateOptions{}); err != nil { - log.Printf("[fluence-webhook] could not create RoleBinding %s/%s: %v", namespace, SidecarServiceAccount, err) - } + log.Printf("[fluence-webhook] created PodGroup %s/%s (minCount=%d)", namespace, group, minCount) } } -// InterceptorOps implements Model C delivery. It injects an init container (the -// sidecar image) that stages the fluence Python package into a shared emptyDir, -// mounts that volume into every Fluxion-resource container, and prepends it to -// PYTHONPATH plus sets FLUENCE_POD_UID. Python auto-imports the staged -// sitecustomize on startup, which runs the interceptor — no user code changes, -// no PYTHONSTARTUP (which only fires interactively), no vendor SDK on our side. -func (m *Mutator) InterceptorOps(pod *corev1.Pod) []spec.Op { - var ops []spec.Op - - // Shared volume. - vol := corev1.Volume{Name: StageVolumeName, VolumeSource: corev1.VolumeSource{EmptyDir: &corev1.EmptyDirVolumeSource{}}} - if len(pod.Spec.Volumes) == 0 { - ops = append(ops, spec.Op{Op: "add", Path: "/spec/volumes", Value: []corev1.Volume{vol}}) - } else { - ops = append(ops, spec.Op{Op: "add", Path: "/spec/volumes/-", Value: vol}) - } - - // Init container that stages the package into the shared volume. - // - // Fail-soft: the interceptor is best-effort, so its delivery must be too. 
We - // wrap the stage command so a failure (bad image, missing python, package - // problem) leaves the shared volume empty and exits 0 rather than blocking - // the user's pod with Init:Error. An empty staged dir simply means the - // interceptor does not run — the user application is unaffected. (This also - // lets CI use a minimal placeholder sidecar image for placement-only tests.) - initc := corev1.Container{ - Name: "fluence-stage", - Image: m.sidecarImage(), - ImagePullPolicy: corev1.PullAlways, - Command: []string{"sh", "-c", - fmt.Sprintf("python -m fluence.stage %s || echo '[fluence] staging skipped (interceptor unavailable)'", StageMountPath)}, - VolumeMounts: []corev1.VolumeMount{{Name: StageVolumeName, MountPath: StageMountPath}}, - } - if len(pod.Spec.InitContainers) == 0 { - ops = append(ops, spec.Op{Op: "add", Path: "/spec/initContainers", Value: []corev1.Container{initc}}) - } else { - ops = append(ops, spec.Op{Op: "add", Path: "/spec/initContainers/-", Value: initc}) - } - - // Mount the staged volume + set PYTHONPATH and FLUENCE_POD_UID on each - // Fluxion-resource container. 
- mount := corev1.VolumeMount{Name: StageVolumeName, MountPath: StageMountPath, ReadOnly: true} - pythonpath := corev1.EnvVar{Name: "PYTHONPATH", Value: StageMountPath} - uid := spec.FieldEnv("FLUENCE_POD_UID", "metadata.uid") - for i, c := range pod.Spec.Containers { - if !spec.RequestsFluxionResource(c) { - continue - } - if len(c.VolumeMounts) == 0 { - ops = append(ops, spec.Op{Op: "add", Path: fmt.Sprintf("/spec/containers/%d/volumeMounts", i), Value: []corev1.VolumeMount{mount}}) - } else { - ops = append(ops, spec.Op{Op: "add", Path: fmt.Sprintf("/spec/containers/%d/volumeMounts/-", i), Value: mount}) - } - if !spec.HasEnv(c, "PYTHONPATH") { - if len(c.Env) == 0 { - ops = append(ops, spec.Op{Op: "add", Path: fmt.Sprintf("/spec/containers/%d/env", i), Value: []corev1.EnvVar{pythonpath}}) - pod.Spec.Containers[i].Env = []corev1.EnvVar{pythonpath} - } else { - ops = append(ops, spec.Op{Op: "add", Path: fmt.Sprintf("/spec/containers/%d/env/-", i), Value: pythonpath}) - pod.Spec.Containers[i].Env = append(pod.Spec.Containers[i].Env, pythonpath) - } - } - if !spec.HasEnv(c, "FLUENCE_POD_UID") { - ops = append(ops, spec.Op{Op: "add", Path: fmt.Sprintf("/spec/containers/%d/env/-", i), Value: uid}) - pod.Spec.Containers[i].Env = append(pod.Spec.Containers[i].Env, uid) - } - } - return ops -} - -// SidecarContainerOps adds the fluence-sidecar container and sets its -// ServiceAccount. observe=true selects observe-only telemetry mode. -func (m *Mutator) SidecarContainerOps(pod *corev1.Pod, observe bool) []spec.Op { - var ops []spec.Op - // The sidecar resolves its vendor provider at runtime from the backend the - // scheduler chose. It gets the same FLUXION_* contract as the workload - // containers (FLUXION_BACKEND + attribute vars like FLUXION_VENDOR), sourced - // via the downward API from the scheduler's annotations — so the values - // resolve once the scheduler writes them, after admission. 
- env := []corev1.EnvVar{ - spec.FieldEnv("FLUENCE_POD_UID", "metadata.uid"), - spec.FieldEnv("FLUENCE_POD_NAME", "metadata.name"), - spec.FieldEnv("FLUENCE_NAMESPACE", "metadata.namespace"), - spec.FieldEnv("FLUENCE_GROUP", "metadata.labels['"+GroupLabel+"']"), - } - env = append(env, m.InjectedEnv()...) - if observe { - env = append(env, corev1.EnvVar{Name: "FLUENCE_OBSERVE", Value: "true"}) - } - // The gang size is known at admission (the leader carries it), even though - // the worker NAMES are not yet. Propagate the expected worker count to the - // sidecar as a static env var so it can wait until it has discovered that - // many gated workers before ungating, rather than ungating a partial set. - // Read from a generic annotation so the core stays domain-agnostic; the - // workload manifest sets it (e.g. from its own N_WORKERS). - if pod.Annotations != nil { - if n := pod.Annotations[ExpectedWorkersAnnotation]; n != "" { - env = append(env, corev1.EnvVar{Name: "FLUENCE_EXPECTED_WORKERS", Value: n}) - } - } - // The sidecar talks to the same backend the workload does (e.g. to find the - // task and read its queue position), so it needs the same credentials. Copy - // the workload container's secret/configmap-sourced env onto the sidecar. - // This stays domain-agnostic: we don't know or name the provider's creds, we - // just propagate whatever the workload pulls from a secret/configMap (e.g. - // AWS_*, IBM tokens). Existing FLUENCE_/FLUXION_ names are not overwritten. 
- if len(pod.Spec.Containers) > 0 { - have := map[string]bool{} - for _, e := range env { - have[e.Name] = true - } - for _, e := range pod.Spec.Containers[0].Env { - if have[e.Name] || e.ValueFrom == nil { - continue - } - if e.ValueFrom.SecretKeyRef != nil || e.ValueFrom.ConfigMapKeyRef != nil { - env = append(env, e) - } - } - } - sidecar := corev1.Container{ - Name: "fluence-sidecar", Image: m.sidecarImage(), ImagePullPolicy: corev1.PullAlways, - Env: env, - Resources: corev1.ResourceRequirements{Requests: corev1.ResourceList{ - corev1.ResourceCPU: *resourceQuantity("100m"), corev1.ResourceMemory: *resourceQuantity("256Mi"), - }}, - } - if len(pod.Spec.Containers) == 0 { - ops = append(ops, spec.Op{Op: "add", Path: "/spec/containers", Value: []corev1.Container{sidecar}}) - } else { - ops = append(ops, spec.Op{Op: "add", Path: "/spec/containers/-", Value: sidecar}) - } - if pod.Spec.ServiceAccountName == "" || pod.Spec.ServiceAccountName == "default" { - ops = append(ops, spec.Op{Op: "add", Path: "/spec/serviceAccountName", Value: SidecarServiceAccount}) - } - return ops -} - // ── Dispatcher ────────────────────────────────────────────────────────────────── // Mutate dispatches the pod to every registered handler and concatenates the diff --git a/pkg/webhook/webhook_test.go b/pkg/webhook/webhook_test.go index 26983d4..9af6c9c 100644 --- a/pkg/webhook/webhook_test.go +++ b/pkg/webhook/webhook_test.go @@ -2,8 +2,6 @@ package webhook import ( "testing" - - corev1 "k8s.io/api/core/v1" ) // EnvVarNames returns the FLUXION_* contract names (used by the scheduler plugin @@ -22,47 +20,3 @@ func TestEnvVarNames(t *testing.T) { } } } - -func TestSidecarInheritsWorkloadSecretEnv(t *testing.T) { - m := &Mutator{} - pod := &corev1.Pod{ - Spec: corev1.PodSpec{ - Containers: []corev1.Container{{ - Name: "gang", - Env: []corev1.EnvVar{ - {Name: "GANG_ROLE", Value: "leader"}, // plain value: NOT copied - {Name: "AWS_ACCESS_KEY_ID", ValueFrom: &corev1.EnvVarSource{ - 
SecretKeyRef: &corev1.SecretKeySelector{ - LocalObjectReference: corev1.LocalObjectReference{Name: "aws-braket-credentials"}, - Key: "AWS_ACCESS_KEY_ID", - }}}, - }, - }}, - }, - } - ops := m.SidecarContainerOps(pod, false) - var sidecar *corev1.Container - for _, op := range ops { - if c, ok := op.Value.(corev1.Container); ok && c.Name == "fluence-sidecar" { - sidecar = &c - } - } - if sidecar == nil { - t.Fatal("no sidecar container added") - } - var gotSecret, gotPlain bool - for _, e := range sidecar.Env { - if e.Name == "AWS_ACCESS_KEY_ID" && e.ValueFrom != nil && e.ValueFrom.SecretKeyRef != nil { - gotSecret = true - } - if e.Name == "GANG_ROLE" { - gotPlain = true - } - } - if !gotSecret { - t.Error("sidecar should inherit the workload's secret-sourced AWS creds") - } - if gotPlain { - t.Error("sidecar should NOT copy plain-value workload env like GANG_ROLE") - } -} diff --git a/python/fluence/providers/base.py b/python/fluence/providers/base.py index dca4429..561bca2 100644 --- a/python/fluence/providers/base.py +++ b/python/fluence/providers/base.py @@ -80,7 +80,7 @@ def find_my_task(self, pod_uid: str, backend: str, timeout: int) -> "Task | None raise NotImplementedError def is_ready_to_ungate(self, task: "Task") -> bool: - """True when workers should be ungated — queue position == 1 or the task + """True when the gang should be ungated — queue position == 1 or the task is already RUNNING/terminal. 
Always implementable.""" raise NotImplementedError @@ -134,4 +134,4 @@ def resolve_from_env() -> "Provider | None": for k, v in os.environ.items(): if k.startswith("FLUXION_"): attrs[k[len("FLUXION_"):].lower()] = v - return resolve(attrs) + return resolve(attrs) \ No newline at end of file diff --git a/python/fluence/providers/braket.py b/python/fluence/providers/braket.py index 23bd9fc..33f1683 100644 --- a/python/fluence/providers/braket.py +++ b/python/fluence/providers/braket.py @@ -49,8 +49,26 @@ def install_interceptor(self, pod_uid: str) -> bool: return False # braket SDK not in this container — fail-soft original_run = AwsDevice.run + faux = os.environ.get("FLUENCE_FAUX_SUBMIT", "").lower() == "true" def patched_run(self, task_specification, *args, **kwargs): + # Two modes of the ONE interceptor: + # faux (worker): the one-off submitter already submitted this task + # before the worker was ungated, so submitting again would + # duplicate it N times. Return a handle to the EXISTING task (by + # ARN, handed over via FLUENCE_QUANTUM_JOB_ID) without submitting. + # tag (submitter): stamp the pod-uid tag so the sidecar can find the + # task in the queue, then submit for real. 
+ if faux: + arn = os.environ.get("FLUENCE_QUANTUM_JOB_ID", "") + if arn: + from braket.aws import AwsQuantumTask + log(f"faux-submit: returning existing task {arn} " + f"(no resubmission)") + return AwsQuantumTask(arn=arn) + log("faux-submit: no job id; suppressing submit " + "(worker consumes results by id)") + return None if pod_uid: tags = kwargs.get("tags", {}) tags[TAG_KEY] = pod_uid @@ -226,4 +244,4 @@ def job_id(self, task: BraketTask) -> str: PROVIDER = BraketProvider() -register(PROVIDER) +register(PROVIDER) \ No newline at end of file diff --git a/python/fluence/sidecar.py b/python/fluence/sidecar.py index 098574b..d0724e5 100644 --- a/python/fluence/sidecar.py +++ b/python/fluence/sidecar.py @@ -1,18 +1,19 @@ """ fluence.sidecar — provider-agnostic quantum coordination sidecar main loop. -Injected by the Fluence webhook into the quantum-submitting pod. Resolves its -vendor at runtime from the backend annotation, discovers the task the user -application submitted (tagged by the interceptor), polls readiness, and either -ungates gated workers (gang mode) or just logs the queue-position series -(observe-only mode). +Injected by the Fluence webhook into the one-off SUBMITTER pod (gang + submitter +model — there is no leader/worker split). Resolves its vendor at runtime from the +backend annotation, discovers the task the user application submitted (tagged by +the interceptor), polls readiness, and either ungates the gated GANG group (gang +mode) or just logs the queue-position series (observe-only mode). Entry point: `fluence-sidecar` console script (see pyproject.toml) -> main(). 
Environment (injected by the Fluence webhook): FLUENCE_POD_UID UID of this pod (matches interceptor tag) FLUENCE_NAMESPACE Kubernetes namespace - FLUENCE_GATED_PODS comma-separated gated worker names + FLUENCE_GANG_GROUP group label of the gated gang to ungate + FLUENCE_GATED_PODS optional explicit comma-separated gang pod names FLUENCE_OBSERVE "true" for observe-only telemetry mode FLUXION_BACKEND / FLUXION_VENDOR scheduler-chosen backend / vendor FLUENCE_TASK_DISCOVERY_TIMEOUT seconds to wait for discovery (default 300) @@ -30,6 +31,7 @@ from fluence.ungate import ungate_pods, gated_pods_from_env, namespace_from_env, wait_for_gated_pods + def _poll(provider, task, poll_interval, ungate): mode = "gang" if ungate else "observe-only" log(f"{mode} mode: polling queue position") @@ -52,18 +54,22 @@ def main(): pod_uid = os.environ.get("FLUENCE_POD_UID", "") pod_name = os.environ.get("FLUENCE_POD_NAME", "") group = os.environ.get("FLUENCE_GROUP", "") + # Gang + submitter model: this sidecar runs in the one-off SUBMITTER pod + # (its own group-of-one, -submitter). The gated workload it must ungate + # is the GANG group, named by FLUENCE_GANG_GROUP (set by the webhook). There + # is no leader/worker split and no -workers subgroup. 
+ gang_group = os.environ.get("FLUENCE_GANG_GROUP", "") backend = os.environ.get("FLUXION_BACKEND", "") observe = os.environ.get("FLUENCE_OBSERVE", "").lower() == "true" discovery_timeout = int(os.environ.get("FLUENCE_TASK_DISCOVERY_TIMEOUT", 300)) poll_interval = int(os.environ.get("FLUENCE_POLL_INTERVAL", 30)) - expected_workers = int(os.environ.get("FLUENCE_EXPECTED_WORKERS", 0)) ungate_timeout = int(os.environ.get("FLUENCE_UNGATE_TIMEOUT", 120)) namespace = namespace_from_env() - log("starting fluence quantum sidecar") + log("starting fluence quantum submitter sidecar") log(f" pod_uid={pod_uid} namespace={namespace} group={group} " - f"backend={backend} observe={observe} expected_workers={expected_workers}") + f"gang_group={gang_group} backend={backend} observe={observe}") provider = resolve_from_env() if provider is None: @@ -75,8 +81,9 @@ def main(): if task is None: log("ERROR: could not discover quantum task") if not observe: - ungate_pods(wait_for_gated_pods(namespace, group, expected_workers, - exclude=pod_name, timeout=ungate_timeout), + # Fail open: ungate the gang so it is not stranded forever. + ungate_pods(wait_for_gated_pods(namespace, gang_group, exclude=pod_name, + timeout=ungate_timeout), "", namespace) sys.exit(1) @@ -89,19 +96,18 @@ def main(): log("observe-only run complete") return - # Wait until all expected gated workers are present (gang is submitted - # together), then ungate them. expected_workers is N-1, propagated by the - # webhook from the leader at admission; if unset we ungate whatever is found. + # Ungate the gang: discover the gated pods in the gang group and remove their + # gate, stamping the job-id so each can fetch results by id. The gang pods are + # created up front (Job/Deployment), so they are present by submit time. 
gated_pods = gated_pods_from_env() or wait_for_gated_pods( - namespace, group, expected_workers, exclude=pod_name, - timeout=ungate_timeout) - log(f"ungating {len(gated_pods)} worker(s): {gated_pods}") + namespace, gang_group, exclude=pod_name, timeout=ungate_timeout) + log(f"ungating {len(gated_pods)} gang pod(s): {gated_pods}") n_ok = ungate_pods(gated_pods, job_id, namespace) if n_ok == len(gated_pods): - log(f"done — {n_ok} worker(s) ungated") + log(f"done — {n_ok} gang pod(s) ungated") else: - log(f"WARNING: ungated only {n_ok}/{len(gated_pods)} worker(s) — see errors above") + log(f"WARNING: ungated only {n_ok}/{len(gated_pods)} gang pod(s) — see errors above") if __name__ == "__main__": - main() + main() \ No newline at end of file diff --git a/python/fluence/ungate.py b/python/fluence/ungate.py index 1019ead..a40e662 100644 --- a/python/fluence/ungate.py +++ b/python/fluence/ungate.py @@ -84,10 +84,10 @@ def gated_pods_from_env(): def discover_gated_pods(namespace, group, exclude=""): """ Find the names of pods in the same group that still carry the quantum - scheduling gate (i.e. the workers this sidecar's leader must ungate). + scheduling gate (i.e. the gang pods this submitter must ungate). - The leader's sidecar is created before the workers are admitted, so the gated - set cannot be known at admission time and must be discovered at runtime. We + The submitter is created alongside the gang, so the gated set is discovered + at runtime rather than known at admission. We list pods by the group label and keep those with the QUANTUM_GATE_NAME gate still present, excluding the leader pod itself. 
""" @@ -114,31 +114,24 @@ def discover_gated_pods(namespace, group, exclude=""): return names -def wait_for_gated_pods(namespace, group, expected, exclude="", timeout=120, - interval=3): +def wait_for_gated_pods(namespace, group, exclude="", timeout=120, interval=3): """ - Wait until at least `expected` gated workers have been discovered in the - group, or `timeout` seconds elapse. The gang is submitted together, so all - workers appear quickly; the timeout is a backstop against a crashed/never- - admitted worker so the sidecar never hangs. Returns the discovered list - (which may be short of `expected` if the timeout fired). + Wait until at least one gated gang pod is discovered in the group (the gang + is created up front, so its pods appear quickly), then return all currently + gated pods. The timeout is a backstop so the submitter never hangs if the + gang never appears. Returns the discovered list (possibly empty on timeout). """ deadline = time.time() + timeout found = [] while time.time() < deadline: found = discover_gated_pods(namespace, group, exclude=exclude) - if expected and len(found) >= expected: - log(f"all {expected} gated worker(s) present") + if found: return found - if not expected: - # No expected count known — return whatever is present now. 
- return found - log(f"waiting for gated workers: {len(found)}/{expected}") + log("waiting for gated gang pods to appear") time.sleep(interval) - log(f"WARNING: timed out waiting for gated workers " - f"({len(found)}/{expected}); ungating what is present") + log("WARNING: timed out waiting for gated gang pods; none found") return found def namespace_from_env(): - return os.environ.get("FLUENCE_NAMESPACE", "default") + return os.environ.get("FLUENCE_NAMESPACE", "default") \ No newline at end of file diff --git a/test/e2e/02-quantum-placement.sh b/test/e2e/02-quantum-placement.sh deleted file mode 100644 index 17897a3..0000000 --- a/test/e2e/02-quantum-placement.sh +++ /dev/null @@ -1,23 +0,0 @@ -#!/usr/bin/env bash -# Quantum placement: a qpu pod is matched to a backend and the webhook injects QRMI_BACKEND. -set -euo pipefail -HERE="$(cd "$(dirname "$0")" && pwd)"; . "${HERE}/lib.sh" -ANN="fluence.flux-framework.org/backend" - -log "TEST 2: quantum placement and backend handoff" -kubectl apply -f examples/test/e2e/quantum-pod-mock.yaml - -wait_pod_phase sampler-mock Running 120 || fail "sampler-mock did not reach Running" - -# fluence must have stamped the chosen backend annotation. -backend="$(kubectl get pod sampler-mock -o jsonpath="{.metadata.annotations.${ANN//./\\.}}" 2>/dev/null || true)" -[ -n "$backend" ] || (show_webhook sampler-mock && fail "backend annotation ($ANN) was not set by fluence") -log "fluence chose backend: $backend" - -# The webhook must have surfaced it as QRMI_BACKEND inside the container. 
-out="$(kubectl logs sampler-mock || true)" -echo "$out" | grep -q "BACKEND=${backend}" \ - || (show_webhook sampler-mock && fail "QRMI_BACKEND in container ('$out') does not match annotation ($backend)") - -log "PASS: qpu pod scheduled, backend '$backend' chosen and injected as QRMI_BACKEND" -kubectl delete -f examples/test/e2e/quantum-pod-mock.yaml --wait=false || true diff --git a/test/e2e/03-restart-recovery.sh b/test/e2e/03-restart-recovery.sh index 20c1be9..c26980f 100644 --- a/test/e2e/03-restart-recovery.sh +++ b/test/e2e/03-restart-recovery.sh @@ -9,7 +9,7 @@ ANN="fluence.flux-framework.org/backend" log "TEST 3: restart does not double-book an exclusive backend" # 1. Schedule the first qpu pod and capture its backend. -kubectl apply -f examples/test/e2e/quantum-pod-mock.yaml +kubectl apply -f examples/test/e2e/quantum/quantum-pod-mock.yaml wait_pod_phase sampler-mock "$NS" Running 120 || fail "sampler-mock did not reach Running" backend="$(kubectl get pod sampler-mock -n "$NS" -o jsonpath="{.metadata.annotations.${ANN//./\\.}}" 2>/dev/null || true)" [ -n "$backend" ] || fail "first pod has no backend annotation" @@ -26,7 +26,7 @@ wait_pod_phase sampler-mock "$NS" Running 30 || fail "first pod not Running afte # 4. A second pod requesting the same exclusive qpu must NOT get the same backend. # If recovery worked, the backend is occupied and the second pod stays Pending. 
-kubectl apply -f examples/test/e2e/quantum-pod-mock-2.yaml +kubectl apply -f examples/test/e2e/quantum/quantum-pod-mock-2.yaml if assert_stays_pending sampler-mock-2 "$NS" 45; then log "PASS: second qpu pod stayed Pending; backend '$backend' was not double-booked" else @@ -38,5 +38,5 @@ else fi fi -kubectl delete -f examples/test/e2e/quantum-pod-mock-2.yaml --wait=false || true -kubectl delete -f examples/test/e2e/quantum-pod-mock.yaml --wait=false || true +kubectl delete -f examples/test/e2e/quantum/quantum-pod-mock-2.yaml --wait=false || true +kubectl delete -f examples/test/e2e/quantum/quantum-pod-mock.yaml --wait=false || true diff --git a/test/e2e/04-sidecar-ungate.sh b/test/e2e/04-sidecar-ungate.sh deleted file mode 100644 index 9ffefc8..0000000 --- a/test/e2e/04-sidecar-ungate.sh +++ /dev/null @@ -1,88 +0,0 @@ -#!/usr/bin/env bash -# Sidecar webhook test. -# -# Verifies that when a PodGroup of size > 1 with QPU resources is submitted: -# 1. The webhook creates fluence-sidecar RBAC in the namespace automatically -# 2. The leader pod gets the sidecar container injected -# 3. The worker pod gets the quantum.braket/ready scheduling gate added -# 4. The worker pod gets fluence-quantum-classical priority class set -# -# Does NOT test the sidecar itself (task discovery, interceptor, -# queue position polling). Those require real AWS credentials and are covered -# by sidecars/providers/braket/test/integration.sh which is run locally. -set -euo pipefail -HERE="$(cd "$(dirname "$0")" && pwd)"; . "${HERE}/lib.sh" - -log "TEST 4: sidecar webhook — RBAC creation, gate injection, sidecar injection" - -kubectl apply -f examples/test/e2e/sidecar-mock-pods.yaml - -# Give webhook time to process the leader pod admission -sleep 3 - -# Print webhook logs — always show these so we can see what happened -log "--- webhook logs ---" -kubectl logs -n kube-system deployment/fluence-webhook --tail=50 || true -log "--- end webhook logs ---" - -# 1. 
Webhook should have created fluence-sidecar ServiceAccount -log "checking webhook created fluence-sidecar ServiceAccount..." -for i in $(seq 1 30); do - kubectl get serviceaccount fluence-sidecar -n default > /dev/null 2>&1 && break - sleep 2 -done -kubectl get serviceaccount fluence-sidecar -n default \ - || fail "webhook did not create fluence-sidecar ServiceAccount" -log " fluence-sidecar ServiceAccount created" - -# 2. Webhook should have created fluence-sidecar Role -kubectl get role fluence-sidecar -n default \ - || fail "webhook did not create fluence-sidecar Role" -log " fluence-sidecar Role created" - -# 3. Webhook should have created fluence-sidecar RoleBinding -kubectl get rolebinding fluence-sidecar -n default \ - || fail "webhook did not create fluence-sidecar RoleBinding" -log " fluence-sidecar RoleBinding created" - -# 4. Leader pod should have the fluence-stage init container injected (Model C: -# it stages the fluence Python package into a shared volume on PYTHONPATH). -log "checking webhook injected the fluence-stage init container..." -wait_pod_phase sidecar-test-leader Running 120 \ - || { kubectl describe pod sidecar-test-leader; fail "sidecar-test-leader did not reach Running"; } -initc=$(kubectl get pod sidecar-test-leader \ - -o jsonpath='{.spec.initContainers[*].name}') -echo "$initc" | grep -q "fluence-stage" \ - || fail "fluence-stage init container not injected (initContainers: $initc)" -log " fluence-stage init container injected" - -# 5. Leader pod should have the sidecar container injected -log "checking sidecar injected into leader pod..." -containers=$(kubectl get pod sidecar-test-leader \ - -o jsonpath='{.spec.containers[*].name}') -echo "$containers" | grep -q "fluence-sidecar" \ - || fail "fluence-sidecar container not injected into leader (containers: $containers)" -log " fluence-sidecar container injected into leader" - -# 6. 
Worker pod should have scheduling gate added by webhook -gate=$(kubectl get pod sidecar-test-worker \ - -o jsonpath='{.spec.schedulingGates[0].name}') -[ "$gate" = "quantum.braket/ready" ] \ - || fail "worker pod does not have quantum.braket/ready gate (got: $gate)" -log " quantum.braket/ready gate set on worker" - -# 7. Worker pod should have the fluence-quantum-classical priority class set by -# the webhook at admission (so it schedules reliably once ungated). -pc=$(kubectl get pod sidecar-test-worker -o jsonpath='{.spec.priorityClassName}') -[ "$pc" = "fluence-quantum-classical" ] \ - || fail "worker pod missing fluence-quantum-classical priority class (got: $pc)" -log " fluence-quantum-classical priority class set on worker" - -log "PASS: webhook correctly created RBAC, injected sidecar, gated worker" -log "NOTE: fluence-quantum-classical priority is set by the webhook at admission (immutable post-creation)" -log "NOTE: braket sidecar integration test (SDK intercept, tag discovery," -log " queue polling) is in sidecars/providers/braket/test/integration.sh" - -# Only clean up pods and PodGroup — RBAC is namespace infrastructure -# that persists for future quantum workflows in this namespace -kubectl delete -f examples/test/e2e/sidecar-mock-pods.yaml diff --git a/test/e2e/01-classical-gang.sh b/test/e2e/gang/01-classical-gang.sh old mode 100644 new mode 100755 similarity index 71% rename from test/e2e/01-classical-gang.sh rename to test/e2e/gang/01-classical-gang.sh index d2018ac..1ebfc64 --- a/test/e2e/01-classical-gang.sh +++ b/test/e2e/gang/01-classical-gang.sh @@ -1,10 +1,10 @@ #!/usr/bin/env bash # Classical gang scheduling: a PodGroup of 2 must be placed all-or-nothing on real nodes. set -euo pipefail -HERE="$(cd "$(dirname "$0")" && pwd)"; . "${HERE}/lib.sh" +HERE="$(cd "$(dirname "$0")" && pwd)"; . 
"${HERE%/test/e2e/*}/test/e2e/lib.sh" log "TEST 1: classical gang scheduling" -kubectl apply -f examples/single-podgroup.yaml +kubectl apply -f examples/test/e2e/gang/single-podgroup.yaml # All pods in the 'training' deployment must reach Running (scheduled + started). # Wait for the pod to EXIST before waiting for Ready — kubectl wait errors out @@ -25,5 +25,9 @@ count="$(kubectl get pods -l app=training --no-headers | wc -l | tr -d ' ')" [ "$count" = "1" ] || fail "expected 2 training pods, got $count" log "PASS: classical gang placed all $count pods via fluence" -kubectl delete -f examples/single-podgroup.yaml --wait=false || true +kubectl delete -f examples/test/e2e/gang/single-podgroup.yaml --wait=false || true kubectl patch podgroup training --type=merge -p '{"metadata":{"finalizers":null}}' 2>/dev/null || true +# Wait for the pods to actually be gone before the next test runs — otherwise a +# terminating 'training' pod (same name/labels reused by other scenarios) can be +# misread as the next test's placement. +kubectl wait --for=delete pod -l app=training --timeout=60s 2>/dev/null || true diff --git a/test/e2e/gang/02-postfilter-rematch.sh b/test/e2e/gang/02-postfilter-rematch.sh new file mode 100755 index 0000000..f74c87b --- /dev/null +++ b/test/e2e/gang/02-postfilter-rematch.sh @@ -0,0 +1,117 @@ +#!/usr/bin/env bash +# PostFilter re-match: when another scheduler plugin (TaintToleration) rejects a +# node Fluxion allocated, Fluence must abandon that allocation, exclude the node, +# and re-match onto an untainted node. Safety: the gang's RUNNING pod must NEVER +# bind to the tainted node. +# +# This test is self-isolating: it uses its own workload name (pf-rematch) and +# labels, distinct from the other e2e scenarios, and ensures a clean slate first, +# so a pod left over (terminating) from a previous test can never be mistaken for +# this test's placement. It also ignores terminating pods when asserting. 
+set -euo pipefail +HERE="$(cd "$(dirname "$0")" && pwd)"; . "${HERE%/test/e2e/*}/test/e2e/lib.sh" + +NAME=pf-rematch +SEL="app=${NAME}" + +log "TEST 5: PostFilter abandons a taint-rejected allocation and re-matches" + +# --- clean slate: no leftover pods from earlier tests under our name ---------- +kubectl delete deployment "$NAME" --ignore-not-found >/dev/null 2>&1 || true +kubectl delete podgroup "$NAME" --ignore-not-found >/dev/null 2>&1 || true +kubectl patch podgroup "$NAME" --type=merge \ + -p '{"metadata":{"finalizers":null}}' >/dev/null 2>&1 || true +kubectl wait --for=delete pod -l "$SEL" --timeout=60s >/dev/null 2>&1 || true +# Defensive: a prior test's workload left running would occupy the only +# untainted worker and make this test fail with a (correct) fluxion +# allocate -1 for lack of capacity. Ensure none lingers. +kubectl delete deployment training --ignore-not-found --wait=false >/dev/null 2>&1 || true +kubectl wait --for=delete pod -l app=training --timeout=60s >/dev/null 2>&1 || true + +TAINTED="$(kubectl get nodes -l '!node-role.kubernetes.io/control-plane' \ + -o jsonpath='{.items[0].metadata.name}')" +[ -n "$TAINTED" ] || fail "no worker node found to taint" +log "tainting node $TAINTED with fluence-e2e=blocked:NoSchedule" +kubectl taint nodes "$TAINTED" fluence-e2e=blocked:NoSchedule --overwrite + +cleanup() { + kubectl taint nodes "$TAINTED" fluence-e2e- 2>/dev/null || true + kubectl delete deployment "$NAME" --ignore-not-found --wait=false 2>/dev/null || true + kubectl delete podgroup "$NAME" --ignore-not-found --wait=false 2>/dev/null || true + kubectl patch podgroup "$NAME" --type=merge \ + -p '{"metadata":{"finalizers":null}}' 2>/dev/null || true +} +trap cleanup EXIT + +# --- our own workload (distinct name/labels; does NOT tolerate the taint) ------ +kubectl apply -f - <" for empty fields, so an empty deletionTimestamp + # shows as "", NOT "". Treat "" as empty for both columns. 
+ if [ "$deleted" != "<none>" ] && [ -n "$deleted" ]; then continue; fi # skip terminating + if [ "$node" = "<none>" ] || [ -z "$node" ]; then continue; fi # skip not-yet-bound + checked=$((checked+1)) + if [ "$node" = "$TAINTED" ]; then + fail "SAFETY VIOLATION: running pod $name is bound to the tainted node $TAINTED" + fi + log "$name correctly placed on $node (not the tainted $TAINTED)" +done < <(kubectl get pods -l "$SEL" \ + -o custom-columns='N:.metadata.name,NODE:.spec.nodeName,DEL:.metadata.deletionTimestamp' \ + --no-headers) + +[ "$checked" -ge 1 ] || fail "no running ${NAME} pod found to check" + +# Informational: did PostFilter actually fire (Fluxion picked the tainted node +# first and we re-matched), or did Fluxion place on the good node directly? +POD="$(kubectl -n kube-system get pods -l app=fluence \ + -o jsonpath='{.items[0].metadata.name}' 2>/dev/null || true)" +if [ -n "$POD" ] && kubectl -n kube-system logs "$POD" 2>/dev/null \ + | grep -q "unschedulable: abandoning allocation"; then + log "observed PostFilter abandonment in scheduler log (re-match path exercised)" +else + log "note: Fluxion placed on the untainted node directly this run (PostFilter not needed)" +fi + +log "PASS: gang scheduled on an untainted node; no running pod on the tainted node" diff --git a/test/e2e/gang/03-multi-gang.sh b/test/e2e/gang/03-multi-gang.sh new file mode 100755 index 0000000..9f01ae5 --- /dev/null +++ b/test/e2e/gang/03-multi-gang.sh @@ -0,0 +1,75 @@ +#!/usr/bin/env bash +# Multi-pod gang scheduling on real nodes. Guards the two failures that the +# single-pod 01 test could NOT catch (and that shipped a minCount=1 bug): +# A) a multi-pod gang must place ALL of them (minCount must equal the gang size, not 1) +# B) under contention, a gang that cannot fully fit stays ENTIRELY pending — +# never partially placed (no stranded pods holding nodes). +set -euo pipefail +HERE="$(cd "$(dirname "$0")" && pwd)"; . 
"${HERE%/test/e2e/*}/test/e2e/lib.sh" + +# ---- A) all-or-nothing placement of a 2-pod gang ------------------------------- +log "TEST 6A: multi-pod gang (2) places all-or-nothing" +kubectl apply -f examples/test/e2e/gang/multi-gang.yaml + +# the webhook must have created the PodGroup with minCount = 2 (the bug set it to 1) +log "checking PodGroup minCount == 2 (set by webhook from group-size)" +for i in $(seq 1 30); do + mc="$(kubectl get podgroup gang3 -o jsonpath='{.spec.schedulingPolicy.gang.minCount}' 2>/dev/null || true)" + [ -n "$mc" ] && break; sleep 2 +done +[ "$mc" = "2" ] || fail "PodGroup gang3 minCount=$mc, want 2 (minCount=1 bug -> partial gangs)" + +log "waiting for all 2 gang pods to be Ready" +wait_pods_ready "app=gang3" 2 180 || fail "gang3 did not place all 2 pods (gang scheduling failed)" + +count="$(kubectl get pods -l app=gang3 --field-selector=status.phase=Running --no-headers | wc -l | tr -d ' ')" +[ "$count" = "2" ] || fail "expected 2 Running gang3 pods, got $count (partial placement)" +for p in $(kubectl get pods -l app=gang3 -o name); do + pod="${p#pod/}" + sched="$(kubectl get pod "$pod" -o jsonpath='{.spec.schedulerName}')" + [ "$sched" = "fluence" ] || fail "$pod not scheduled by fluence (got: $sched)" +done +log "PASS 6A: 2-pod gang placed atomically by fluence (minCount=2)" + +kubectl delete -f examples/test/e2e/gang/multi-gang.yaml --wait=false || true +kubectl patch podgroup gang3 --type=merge -p '{"metadata":{"finalizers":null}}' 2>/dev/null || true +kubectl wait --for=delete pod -l app=gang3 --timeout=60s 2>/dev/null || true + +# ---- B) contention: the gang that can't fully fit stays ENTIRELY pending -------- +log "TEST 6B: contention — a gang that cannot fully fit must NOT partially place" +kubectl apply -f examples/test/e2e/gang/multi-gang-contention.yaml + +# wait until the cluster settles. 
Three possible outcomes: +# - one gang fully Running, other fully Pending -> contention; assert no partial +# - BOTH fully Running -> runner big enough, no contention to test (skip) +# - any partial (1 of 2 in a gang scheduled) -> the bug, fail +log "waiting for gangs to settle" +winner=""; loser=""; both="" +for i in $(seq 1 90); do + ra="$(kubectl get pods -l app=gang-a --field-selector=status.phase=Running --no-headers 2>/dev/null | wc -l | tr -d ' ')" + rb="$(kubectl get pods -l app=gang-b --field-selector=status.phase=Running --no-headers 2>/dev/null | wc -l | tr -d ' ')" + if [ "$ra" = "2" ] && [ "$rb" = "2" ]; then both=1; break; fi + if [ "$ra" = "2" ] && [ "$rb" = "0" ]; then winner=gang-a; loser=gang-b; break; fi + if [ "$rb" = "2" ] && [ "$ra" = "0" ]; then winner=gang-b; loser=gang-a; break; fi + sleep 2 +done + +if [ -n "$both" ]; then + log "SKIP 6B: cluster placed both gangs (>=4 schedulable cores) — no contention on this runner" +else + [ -n "$winner" ] || fail "no clean settle: gang-a=$ra gang-b=$rb running (possible PARTIAL placement)" + log "winner=$winner (2 running), loser=$loser (expected 0 running)" + # the loser must have ZERO pods scheduled to a node — the all-or-nothing guarantee. + # A single scheduled loser pod = partial placement = the bug. + scheduled_loser="$(kubectl get pods -l app=$loser -o jsonpath='{range .items[*]}{.spec.nodeName}{"\n"}{end}' | grep -c . 
|| true)" + [ "$scheduled_loser" = "0" ] || fail "$loser has $scheduled_loser pod(s) on a node — PARTIAL placement (gang violated)" + log "PASS 6B: $loser stayed entirely pending — no partial placement under contention" +fi + +kubectl delete -f examples/test/e2e/gang/multi-gang-contention.yaml --wait=false || true +for g in gang-a gang-b; do + kubectl patch podgroup $g --type=merge -p '{"metadata":{"finalizers":null}}' 2>/dev/null || true +done +kubectl wait --for=delete pod -l app=gang-a --timeout=60s 2>/dev/null || true +kubectl wait --for=delete pod -l app=gang-b --timeout=60s 2>/dev/null || true +log "PASS: multi-gang all-or-nothing verified" diff --git a/test/e2e/gang/04-requeue-on-capacity.sh b/test/e2e/gang/04-requeue-on-capacity.sh new file mode 100755 index 0000000..f41aa71 --- /dev/null +++ b/test/e2e/gang/04-requeue-on-capacity.sh @@ -0,0 +1,87 @@ +#!/usr/bin/env bash +# Requeue-on-capacity + gang atomicity under contention. +# +# Two 2-pod gangs contend for a cluster that can only run one at a time. This +# guards two invariants that the GKE contention runs exposed: +# 1. ALL-OR-NOTHING: each gang places ALL its pods or NONE — never a partial +# (e.g. 1-of-2 scheduled). The winner must be a clean 2/2; the loser a clean +# 0/2 while it waits. +# 2. REQUEUE: when the winner completes and frees its nodes, the loser is +# re-attempted on its own (no manual nudge) and then ALSO places atomically +# (2/2), driven by the shortened --pod-max-in-unschedulable-pods-duration. +# +# SCOPE / LIMITATION: this is a 3-node kind cluster with small (1-core) pods. It +# verifies the INVARIANTS on a minimal contention case. It does NOT reproduce the +# GKE-scale dynamics where the bug was first seen — one-pod-per-node (~80-core) +# saturation and ~20 simultaneous mixed-size gangs draining in sequence. That +# scale behavior is validated on the real cluster, not in CI; a pass here means +# the invariants hold on the simple case, not that large-scale draining is proven. 
+set -euo pipefail +HERE="$(cd "$(dirname "$0")" && pwd)"; . "${HERE%/test/e2e/*}/test/e2e/lib.sh" + +# running-pod count for a gang (job-name label set by the Job controller) +running() { kubectl get pods -l job-name="$1" --field-selector=status.phase=Running --no-headers 2>/dev/null | wc -l | tr -d ' '; } +# count of a gang's pods actually bound to a node (Running OR already Succeeded) +on_nodes() { kubectl get pods -l job-name="$1" -o jsonpath='{range .items[*]}{.spec.nodeName}{"\n"}{end}' 2>/dev/null | grep -c . || true; } + +log "TEST 9: contended gangs stay all-or-nothing, loser requeues when capacity frees" +kubectl apply -f examples/test/e2e/gang/multi-gang-requeue.yaml + +# ---- 1. one gang wins CLEANLY (2/2); the other places NOTHING (0/2) ------------ +log "waiting for a clean 2/0 split (one whole gang runs, the other entirely waits)" +winner=""; loser="" +for i in $(seq 1 60); do + rw="$(running gang-win)"; ra="$(running gang-wait)" + if [ "$rw" = "2" ] && [ "$ra" = "0" ]; then winner=gang-win; loser=gang-wait; break; fi + if [ "$ra" = "2" ] && [ "$rw" = "0" ]; then winner=gang-wait; loser=gang-win; break; fi + # a 1/x or x/1 state that persists is a PARTIAL gang — fail fast on it + if [ "$rw" = "1" ] || [ "$ra" = "1" ]; then + sleep 6 # allow a transient mid-bind moment to resolve + rw="$(running gang-win)"; ra="$(running gang-wait)" + { [ "$rw" = "1" ] || [ "$ra" = "1" ]; } && \ + fail "PARTIAL gang: gang-win=$rw gang-wait=$ra running (all-or-nothing violated)" + fi + sleep 2 +done +[ -n "$winner" ] || fail "no clean 2/0 split (gang-win=$(running gang-win) gang-wait=$(running gang-wait))" +log " winner=$winner (2/2 running), loser=$loser" + +# loser must have ZERO pods on any node — not even one (that would be a partial) +sl="$(on_nodes "$loser")" +[ "$sl" = "0" ] || fail "$loser has $sl pod(s) bound while it should be entirely pending — PARTIAL placement" +log " $loser entirely pending (0 pods bound) — all-or-nothing holds" + +# ---- 2. 
winner completes -> loser is requeued AND places atomically ------------ +log "waiting for winner=$winner to complete and free its nodes" +kubectl wait --for=condition=complete job/$winner --timeout=120s || fail "$winner did not complete" +log " $winner completed; capacity freed" + +# The loser must now place ALL its pods (2/2), on its own, within a window above +# the 30s recheck flush but below the 5m default — proving the shortened timeout +# is in effect AND that the requeued gang is still atomic (not a partial). +log "asserting $loser requeues and places ATOMICALLY (2/2) within ~75s" +ok="" +for i in $(seq 1 38); do # ~75s + rl="$(running $loser)" + dl="$(kubectl get pods -l job-name=$loser --field-selector=status.phase=Succeeded --no-headers 2>/dev/null | wc -l | tr -d ' ')" + # both pods accounted for (running and/or already completed) = atomic placement + [ "$((rl + dl))" = "2" ] && { ok=1; break; } + # a lone 1/2 that lingers = partial placement of the requeued gang + if [ "$((rl + dl))" = "1" ]; then + sleep 6 + rl="$(running $loser)"; dl="$(kubectl get pods -l job-name=$loser --field-selector=status.phase=Succeeded --no-headers 2>/dev/null | wc -l | tr -d ' ')" + [ "$((rl + dl))" = "1" ] && fail "$loser placed 1 of 2 pods — PARTIAL placement of the requeued gang" + fi + sleep 2 +done +[ -n "$ok" ] || fail "$loser did NOT place both pods within 75s of capacity freeing — \ +either the shortened --pod-max-in-unschedulable-pods-duration is not taking effect \ +(gang stuck) or the requeued gang did not assemble" +log "PASS 9: $loser requeued and placed atomically (2/2) after $winner freed capacity" + +kubectl delete -f examples/test/e2e/gang/multi-gang-requeue.yaml --wait=false || true +for g in gang-win gang-wait; do + kubectl patch podgroup $g --type=merge -p '{"metadata":{"finalizers":null}}' 2>/dev/null || true +done +kubectl wait --for=delete pod -l job-name=gang-win --timeout=60s 2>/dev/null || true +kubectl wait --for=delete pod -l job-name=gang-wait 
--timeout=60s 2>/dev/null || true diff --git a/test/e2e/lib.sh b/test/e2e/lib.sh index cad6a2e..13390c9 100644 --- a/test/e2e/lib.sh +++ b/test/e2e/lib.sh @@ -44,7 +44,7 @@ wait_fluence_ready() { show_webhook() { pod=$1 - echo "FAIL: QRMI_BACKEND mismatch" + echo "FAIL: FLUXION_BACKEND mismatch" kubectl get pod $pod -o jsonpath='{.spec.containers[0].env}'; echo kubectl get pod $pod -o jsonpath='{.metadata.annotations}'; echo kubectl -n kube-system logs deploy/fluence-webhook --tail=50 diff --git a/test/e2e/quantum/01-quantum-placement.sh b/test/e2e/quantum/01-quantum-placement.sh new file mode 100755 index 0000000..8f5c475 --- /dev/null +++ b/test/e2e/quantum/01-quantum-placement.sh @@ -0,0 +1,34 @@ +#!/usr/bin/env bash +# Quantum placement: a qpu pod is matched to a backend and the webhook injects FLUXION_BACKEND. +set -euo pipefail +HERE="$(cd "$(dirname "$0")" && pwd)"; . "${HERE%/test/e2e/*}/test/e2e/lib.sh" +ANN="fluence.flux-framework.org/backend" + +log "TEST 2: quantum placement and backend handoff" +kubectl apply -f examples/test/e2e/quantum/quantum-pod-mock.yaml + +wait_pod_phase sampler-mock Running 120 || fail "sampler-mock did not reach Running" + +# fluence must have stamped the chosen backend annotation. +backend="$(kubectl get pod sampler-mock -o jsonpath="{.metadata.annotations.${ANN//./\\.}}" 2>/dev/null || true)" +[ -n "$backend" ] || (show_webhook sampler-mock && fail "backend annotation ($ANN) was not set by fluence") +log "fluence chose backend: $backend" + +# The webhook must have surfaced it as FLUXION_BACKEND inside the container. +out="$(kubectl logs sampler-mock || true)" +if ! echo "$out" | grep -q "BACKEND=${backend}"; then + # Diagnostic (CI has no interactive shell): show whether the env var is ABSENT + # (not injected -> webhook issue) or PRESENT-BUT-EMPTY (annotation not resolved + # at container start -> delivery/timing issue), and what the container actually got. 
+ log "--- diagnostic: container env spec ---" + kubectl get pod sampler-mock -o jsonpath='{.spec.containers[0].env}' ; echo + log "--- diagnostic: live value via exec ---" + kubectl exec sampler-mock -- sh -c 'echo "FLUXION_BACKEND=[$FLUXION_BACKEND]"' 2>&1 || true + log "--- diagnostic: backend annotation on pod ---" + kubectl get pod sampler-mock -o jsonpath="{.metadata.annotations.${ANN//./\\.}}" ; echo + show_webhook sampler-mock + fail "FLUXION_BACKEND in container ('$out') does not match annotation ($backend)" +fi + +log "PASS: qpu pod scheduled, backend '$backend' chosen and injected as FLUXION_BACKEND" +kubectl delete -f examples/test/e2e/quantum/quantum-pod-mock.yaml --wait=false || true diff --git a/test/e2e/quantum/02-sidecar-ungate.sh b/test/e2e/quantum/02-sidecar-ungate.sh new file mode 100755 index 0000000..88f047b --- /dev/null +++ b/test/e2e/quantum/02-sidecar-ungate.sh @@ -0,0 +1,82 @@ +#!/usr/bin/env bash +# Gang + submitter webhook test (no leader/worker). +# +# When a quantum workload (a gang of N pods all requesting QPU, no roles) is +# submitted, the webhook must: +# 1. create the fluence-sidecar RBAC in the namespace automatically +# 2. gate every gang pod with quantum.braket/ready +# 3. raise every gang pod to the fluence-quantum-classical priority class +# 4. ADDITIONALLY create the one-off submitter pod <group>-submitter +# 5. inject the fluence-stage init container + the sidecar container into the +# submitter (Model C staging + the real coordinator) +# +# Does NOT test the sidecar runtime (task discovery, interceptor, queue polling) +# — that needs real AWS creds (sidecars/providers/braket/test/integration.sh). +set -euo pipefail +HERE="$(cd "$(dirname "$0")" && pwd)"; . 
"${HERE%/test/e2e/*}/test/e2e/lib.sh" + +GROUP=qgang +SUBMITTER=${GROUP}-submitter + +log "TEST 4: gang+submitter webhook — RBAC, gating, priority, submitter creation" +kubectl apply -f examples/test/e2e/quantum/quantum-gang-pods.yaml +sleep 3 + +log "--- webhook logs ---" +kubectl logs -n kube-system deployment/fluence-webhook --tail=50 || true +log "--- end webhook logs ---" + +# 1. RBAC created by the webhook (idempotent, per-namespace). +log "checking webhook created fluence-sidecar RBAC..." +for i in $(seq 1 30); do + kubectl get serviceaccount fluence-sidecar -n default >/dev/null 2>&1 && break + sleep 2 +done +kubectl get serviceaccount fluence-sidecar -n default || fail "no fluence-sidecar ServiceAccount" +kubectl get role fluence-sidecar -n default || fail "no fluence-sidecar Role" +kubectl get rolebinding fluence-sidecar -n default || fail "no fluence-sidecar RoleBinding" +log " RBAC present" + +# 2 + 3. Every gang pod is gated and at the preempting priority class. +for p in ${GROUP}-0 ${GROUP}-1; do + gate="$(kubectl get pod "$p" -o jsonpath='{.spec.schedulingGates[0].name}' 2>/dev/null || true)" + [ "$gate" = "quantum.braket/ready" ] || fail "$p not gated (gate=$gate)" + pc="$(kubectl get pod "$p" -o jsonpath='{.spec.priorityClassName}' 2>/dev/null || true)" + [ "$pc" = "fluence-quantum-classical" ] || fail "$p priorityClass=$pc, want fluence-quantum-classical" +done +log " gang pods gated + fluence-quantum-classical priority" + +# 4. Fluence created the submitter pod. +log "checking webhook created the submitter pod $SUBMITTER..." 
+for i in $(seq 1 30); do + kubectl get pod "$SUBMITTER" -n default >/dev/null 2>&1 && break + sleep 2 +done +kubectl get pod "$SUBMITTER" -n default || fail "webhook did not create submitter pod $SUBMITTER" +sub_marker="$(kubectl get pod "$SUBMITTER" -o jsonpath='{.metadata.annotations.fluence\.flux-framework\.org/submitter}' 2>/dev/null || true)" +[ "$sub_marker" = "true" ] || fail "submitter missing the submitter marker" +log " submitter pod created" + +# 5. Submitter has the staging init container + the sidecar container, and is NOT gated. +wait_pod_phase "$SUBMITTER" Running 120 \ + || { kubectl describe pod "$SUBMITTER"; fail "$SUBMITTER did not reach Running"; } +initc="$(kubectl get pod "$SUBMITTER" -o jsonpath='{.spec.initContainers[*].name}')" +echo "$initc" | grep -q fluence-stage || fail "fluence-stage init container not injected (init: $initc)" +conts="$(kubectl get pod "$SUBMITTER" -o jsonpath='{.spec.containers[*].name}')" +echo "$conts" | grep -q fluence-sidecar || fail "fluence-sidecar container not injected (containers: $conts)" +sgate="$(kubectl get pod "$SUBMITTER" -o jsonpath='{.spec.schedulingGates[0].name}' 2>/dev/null || true)" +[ -z "$sgate" ] || fail "submitter must NOT be gated (gate=$sgate)" +log " submitter has fluence-stage + fluence-sidecar, not gated" + +log "PASS: webhook gated the gang, set priority, created RBAC + the submitter" +log "NOTE: priority is set at admission (immutable post-creation)" +log "NOTE: braket sidecar runtime (SDK intercept, tag discovery, queue polling)" +log " is in sidecars/providers/braket/test/integration.sh" + +# Clean up pods + PodGroups; RBAC is namespace infra and persists. 
+kubectl delete -f examples/test/e2e/quantum/quantum-gang-pods.yaml --wait=false || true +kubectl delete pod "$SUBMITTER" --wait=false 2>/dev/null || true +for g in "$GROUP" "$SUBMITTER"; do + kubectl patch podgroup "$g" --type=merge -p '{"metadata":{"finalizers":null}}' 2>/dev/null || true +done +kubectl wait --for=delete pod -l app="$GROUP" --timeout=60s 2>/dev/null || true diff --git a/test/e2e/quantum/03-gang-submitter.sh b/test/e2e/quantum/03-gang-submitter.sh new file mode 100644 index 0000000..46905ca --- /dev/null +++ b/test/e2e/quantum/03-gang-submitter.sh @@ -0,0 +1,63 @@ +#!/usr/bin/env bash +# Gang + submitter structure (replaces the old leader/worker split). +# +# The structural guarantee the ungate path depends on: a quantum gang of size N +# is ONE fully-gated PodGroup (minCount N), and Fluence creates a +# SEPARATE submitter pod in its OWN group-of-one <group>-submitter (minCount 1, +# not gated) that does the real submit and ungates the gang. There is no +# <group>-workers subgroup and no leader among the user's pods. (The runtime +# ungate is covered by the braket integration test; here we prove the shape.) +set -euo pipefail +HERE="$(cd "$(dirname "$0")" && pwd)"; . "${HERE%/test/e2e/*}/test/e2e/lib.sh" + +GROUP=qgang +SUBMITTER=${GROUP}-submitter + +log "TEST 7: gang(N, gated) + separate submitter(1) structure" +kubectl apply -f examples/test/e2e/quantum/quantum-gang-pods.yaml + +# Gang PodGroup exists with minCount N=2 (full gang, no split). +log "checking gang group '$GROUP' minCount == 2 (full N)" +for i in $(seq 1 30); do + gc="$(kubectl get podgroup "$GROUP" -o jsonpath='{.spec.schedulingPolicy.gang.minCount}' 2>/dev/null || true)" + [ -n "$gc" ] && break; sleep 2 +done +[ "$gc" = "2" ] || fail "gang group $GROUP minCount=$gc, want 2 (full N)" + +# There must be NO <group>-workers subgroup (the old split is gone). 
+if kubectl get podgroup "${GROUP}-workers" >/dev/null 2>&1; then + fail "found ${GROUP}-workers PodGroup — the obsolete leader/worker split must not exist" +fi +log " gang group minCount=2, no -workers subgroup" + +# Submitter PodGroup <group>-submitter exists with minCount 1 (schedules alone). +log "checking submitter group '$SUBMITTER' minCount == 1" +for i in $(seq 1 30); do + sc="$(kubectl get podgroup "$SUBMITTER" -o jsonpath='{.spec.schedulingPolicy.gang.minCount}' 2>/dev/null || true)" + [ -n "$sc" ] && break; sleep 2 +done +[ "$sc" = "1" ] || fail "submitter group $SUBMITTER minCount=$sc, want 1" + +# Submitter pod records the gang group it ungates, and is its own group. +gg="$(kubectl get pod "$SUBMITTER" -o jsonpath='{.metadata.annotations.fluence\.flux-framework\.org/gang-group}' 2>/dev/null || true)" +[ "$gg" = "$GROUP" ] || fail "submitter gang-group annotation=$gg, want $GROUP" +sl="$(kubectl get pod "$SUBMITTER" -o jsonpath='{.metadata.labels.fluence\.flux-framework\.org/group}' 2>/dev/null || true)" +[ "$sl" = "$SUBMITTER" ] || fail "submitter group label=$sl, want $SUBMITTER" +log " submitter group minCount=1, ungates gang '$GROUP'" + +# Gang pods stay in <group> (NOT relinked) and are gated. 
+for p in ${GROUP}-0 ${GROUP}-1; do + g="$(kubectl get pod "$p" -o jsonpath='{.metadata.labels.fluence\.flux-framework\.org/group}' 2>/dev/null || true)" + [ "$g" = "$GROUP" ] || fail "$p group label=$g, want $GROUP (gang pods must not be relinked)" + gate="$(kubectl get pod "$p" -o jsonpath='{.spec.schedulingGates[0].name}' 2>/dev/null || true)" + [ "$gate" = "quantum.braket/ready" ] || fail "$p not gated (gate=$gate)" +done +log " gang pods remain in '$GROUP' and are gated" + +log "PASS 7: gang(N=2, gated) + submitter(1, ungates gang), no leader/worker split" +kubectl delete -f examples/test/e2e/quantum/quantum-gang-pods.yaml --wait=false || true +kubectl delete pod "$SUBMITTER" --wait=false 2>/dev/null || true +for g in "$GROUP" "$SUBMITTER"; do + kubectl patch podgroup "$g" --type=merge -p '{"metadata":{"finalizers":null}}' 2>/dev/null || true +done +kubectl wait --for=delete pod -l app="$GROUP" --timeout=60s 2>/dev/null || true diff --git a/test/e2e/quantum/04-gang-env-contract.sh b/test/e2e/quantum/04-gang-env-contract.sh new file mode 100755 index 0000000..19f2439 --- /dev/null +++ b/test/e2e/quantum/04-gang-env-contract.sh @@ -0,0 +1,58 @@ +#!/usr/bin/env bash +# Env-contract e2e (gang + submitter): verify the webhook injects, at admission, +# the env the runtime depends on — IN-CLUSTER, on the real pod specs, with no +# Braket/AWS and WITHOUT requiring scheduling. Guards the seam that, if broken, +# makes a gang schedule then hang or double-submit. +# +# Spec layer only (these are downward-API valueFrom refs whose VALUES resolve at +# placement, but whose PRESENCE is deterministic at admission), so no scheduling, +# no qpu capacity, no logs — it cannot flake on capacity. Contract: +# gang pod (faux): FLUENCE_FAUX_SUBMIT, FLUENCE_QUANTUM_JOB_ID, PYTHONPATH, FLUXION_BACKEND +# submitter: FLUENCE_GANG_GROUP on the sidecar (real submit, ungates the gang) +set -euo pipefail +HERE="$(cd "$(dirname "$0")" && pwd)"; . 
"${HERE%/test/e2e/*}/test/e2e/lib.sh" + +GROUP=qgang +SUBMITTER=${GROUP}-submitter + +log "TEST 8: gang+submitter env contract — spec layer" +kubectl apply -f examples/test/e2e/quantum/quantum-gang-pods.yaml + +# does container $2 of pod $1 have an env entry named $3 ? (spec-level only) +has_env() { + kubectl get pod "$1" -o jsonpath="{.spec.containers[?(@.name=='$2')].env[*].name}" \ + 2>/dev/null | tr ' ' '\n' | grep -qx "$3" +} + +log "checking the webhook wired the faux contract onto a gang pod" +for i in $(seq 1 15); do has_env ${GROUP}-0 app FLUENCE_FAUX_SUBMIT && break; sleep 2; done +for v in FLUENCE_FAUX_SUBMIT FLUENCE_QUANTUM_JOB_ID PYTHONPATH FLUXION_BACKEND; do + has_env ${GROUP}-0 app "$v" \ + || { kubectl get pod ${GROUP}-0 -o yaml | sed -n '/containers:/,/status:/p'; \ + fail "gang pod 'app' container missing env '$v'"; } + log " gang pod has env: $v" +done + +# The submitter's sidecar must know which gang to ungate. +log "checking the submitter sidecar has FLUENCE_GANG_GROUP=$GROUP" +for i in $(seq 1 30); do kubectl get pod "$SUBMITTER" >/dev/null 2>&1 && break; sleep 2; done +gg="$(kubectl get pod "$SUBMITTER" \ + -o jsonpath="{.spec.containers[?(@.name=='fluence-sidecar')].env[?(@.name=='FLUENCE_GANG_GROUP')].value}" \ + 2>/dev/null || true)" +[ "$gg" = "$GROUP" ] || fail "submitter sidecar FLUENCE_GANG_GROUP=$gg, want $GROUP" +log " submitter sidecar has FLUENCE_GANG_GROUP=$gg" + +# And the submitter must NOT be in faux mode (it does the real submit). 
+if has_env "$SUBMITTER" app FLUENCE_FAUX_SUBMIT; then + fail "submitter must NOT carry FLUENCE_FAUX_SUBMIT (it submits for real)" +fi +log " submitter is not faux" + +log "PASS 8: webhook injects the gang(faux) + submitter(real) env contract at admission" + +kubectl delete -f examples/test/e2e/quantum/quantum-gang-pods.yaml --wait=false || true +kubectl delete pod "$SUBMITTER" --wait=false 2>/dev/null || true +for g in "$GROUP" "$SUBMITTER"; do + kubectl patch podgroup "$g" --type=merge -p '{"metadata":{"finalizers":null}}' 2>/dev/null || true +done +kubectl wait --for=delete pod -l app="$GROUP" --timeout=60s 2>/dev/null || true diff --git a/test/e2e/quantum/setup.sh b/test/e2e/quantum/setup.sh new file mode 100644 index 0000000..cf35020 --- /dev/null +++ b/test/e2e/quantum/setup.sh @@ -0,0 +1,54 @@ +#!/usr/bin/env bash +# Quantum suite setup (run by the e2e-suite workflow before the NN-*.sh tests). +# +# Installs the qpu add-on so nodes advertise fluxion.flux-framework.org/qpu — +# without it every quantum pod stays Pending (fluence matches in its own graph, +# but the default NodeResourcesFit plugin rejects each node because the extended +# resource is not in allocatable, so the match is rolled back). The base deploy +# (deploy/fluence-test.yaml) does NOT include this; it is quantum-only. +# +# Also points the webhook-injected sidecar/stage image at the CI-loaded image: +# the default sidecar image (ghcr.io/.../fluence-sidecar:latest) is not loaded in +# kind, so the submitter's containers could not pull. The fluence-stage init is +# fail-soft (no python in this image -> it logs and exits 0), which is fine for +# the structural assertions; the submitter still schedules and runs. +set -euo pipefail +HERE="$(cd "$(dirname "$0")" && pwd)"; . 
"${HERE%/test/e2e/*}/test/e2e/lib.sh" +IMAGE="${IMAGE:-vanessa/fluence:test}" + +log "quantum setup: installing the qpu add-on (resources ConfigMap + device plugin)" +kubectl apply -f deploy/fluence-resources-test.yaml + +# Run the device plugin from the CI-loaded image (its manifest ships a registry +# image that kind has not pulled). Container name is 'deviceplugin'. +kubectl -n kube-system set image daemonset/fluence-deviceplugin deviceplugin="$IMAGE" +kubectl -n kube-system patch daemonset/fluence-deviceplugin --type=json \ + -p '[{"op":"replace","path":"/spec/template/spec/containers/0/imagePullPolicy","value":"IfNotPresent"}]' \ + 2>/dev/null || true + +# Injected sidecar + stage init must use a present image too (see header). +kubectl -n kube-system set env deployment/fluence-webhook FLUENCE_SIDECAR_IMAGE="$IMAGE" +kubectl -n kube-system rollout status deployment/fluence-webhook --timeout=180s + +# Scheduler re-reads the resources config now that the ConfigMap exists. +kubectl -n kube-system rollout restart deployment/fluence +kubectl -n kube-system rollout status deployment/fluence --timeout=180s + +log "waiting for the device plugin DaemonSet to be Ready" +kubectl -n kube-system rollout status daemonset/fluence-deviceplugin --timeout=180s + +# Block until at least one node advertises the qpu extended resource, so the +# tests do not race the kubelet's device registration. +log "waiting for nodes to advertise fluxion.flux-framework.org/qpu" +ok=0 +for i in $(seq 1 60); do + if kubectl get nodes -o jsonpath='{.items[*].status.allocatable}' 2>/dev/null \ + | grep -q 'fluxion.flux-framework.org/qpu'; then + ok=1; break + fi + sleep 3 +done +[ "$ok" = 1 ] || fail "no node advertised fluxion.flux-framework.org/qpu after the add-on (device plugin not registering)" +log "qpu advertised on at least one node" + +log "quantum setup complete: qpu add-on installed, scheduler restarted, sidecar image=$IMAGE" \ No newline at end of file