converged-computing · vsoch · Jun 25, 2026 · Jun 27, 2026 · Jun 27, 2026
diff --git a/.github/workflows/e2e-suite.yaml b/.github/workflows/e2e-suite.yaml
@@ -0,0 +1,125 @@
+# Reusable e2e workflow (workflow_call): shared setup (build image, kind, deploy
+# fluence base), then run ONE test suite — a directory under test/e2e/. The
+# suite's tests are DISCOVERED (every NN-*.sh, run in sorted order); adding a test
+# is just dropping a file in the directory, no workflow edit. If the suite needs
+# special preparation it provides a setup.sh in its directory, which is run before
+# the tests (the gang suite has none; the quantum suite installs the qpu add-on).
+name: e2e-suite
+on:
+  workflow_call:
+    inputs:
+      suite:
+        description: "test suite directory name under test/e2e/ (e.g. gang, quantum)"
+        required: true
+        type: string
+
+# Also exported to the shell of every run step as $IMAGE.
+env:
+  IMAGE: vanessa/fluence:test
+
+jobs:
+  run:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v3
+
+      - name: Build fluence image
+        uses: docker/build-push-action@v6
+        with:
+          context: .
+          file: ./Dockerfile
+          push: false
+          load: true
+          tags: ${{ env.IMAGE }}
+          cache-from: type=gha
+          cache-to: type=gha,mode=max
+
+      - name: Create k8s Kind Cluster
+        uses: helm/kind-action@v1.10.0
+        with:
+          version: v0.32.0              # required for gang
+          node_image: kindest/node:v1.36.1
+          config: ./deploy/kind-config.yaml
+
+      - name: Free Disk Space (Ubuntu)
+        run: |
+          sudo rm -rf /usr/share/dotnet /usr/local/lib/android /opt/ghc \
+                      /opt/hostedtoolcache/CodeQL
+          sudo apt-get clean
+          df -h
+
+      - name: Load docker images
+        run: |
+          cluster=$(kind get clusters)
+          kind load --name "$cluster" docker-image "$IMAGE"
+
+      - name: Deploy fluence (base)
+        run: |
+          kubectl apply -f deploy/fluence-test.yaml
+          kubectl rollout status -n kube-system deployment/fluence --timeout=180s
+          # rollout status can return while the OLD ReplicaSet's pod is still
+          # terminating; exec/logs against that stale pod 404. Poll (up to 60 times,
+          # 2s apart) for a pod that is Ready AND has no deletionTimestamp.
+          POD=""
+          for i in $(seq 1 60); do
+            POD=$(kubectl -n kube-system get pods -l app=fluence \
+              -o go-template='{{range .items}}{{if not .metadata.deletionTimestamp}}{{$name := .metadata.name}}{{range .status.conditions}}{{if and (eq .type "Ready") (eq .status "True")}}{{$name}}{{"\n"}}{{end}}{{end}}{{end}}{{end}}' 2>/dev/null | head -1 || true)
+            [ -n "$POD" ] && break
+            sleep 2
+          done
+          [ -n "$POD" ] || { echo "ERROR: no Ready non-terminating fluence pod"; kubectl -n kube-system get pods -l app=fluence -o wide; exit 1; }
+          echo "Using pod: $POD"
+          # Brief sleep to let the container runtime stabilize before exec
+          sleep 5
+          # Best-effort dump of /tmp/fluence-graph-*.json; its absence must not fail setup.
+          kubectl -n kube-system exec "$POD" -- /bin/bash -c "cat /tmp/fluence-graph-*.json" || true
+          kubectl get nodes -o jsonpath='{range .items[*]}{.metadata.name}{": cpu="}{.status.allocatable.cpu}{" mem="}{.status.allocatable.memory}{"\n"}{end}'
+
+      # Per-suite special setup, if the suite directory provides one.
+      - name: Suite setup (${{ inputs.suite }})
+        env:
+          # Pass the input through the environment rather than interpolating it
+          # into the script text, so its value is never parsed as shell code.
+          SUITE: ${{ inputs.suite }}
+        run: |
+          s="test/e2e/$SUITE/setup.sh"
+          if [ -f "$s" ]; then
+            echo "running $s"
+            bash "$s"
+          else
+            echo "no setup.sh for suite '$SUITE' — skipping"
+          fi
+
+      # Discover and run every NN-*.sh in the suite directory, in sorted order.
+      - name: Run suite (${{ inputs.suite }})
+        env:
+          SUITE: ${{ inputs.suite }}
+        run: |
+          dir="test/e2e/$SUITE"
+          [ -d "$dir" ] || { echo "ERROR: no such suite dir: $dir"; exit 1; }
+          shopt -s nullglob
+          tests=("$dir"/[0-9]*.sh)
+          [ ${#tests[@]} -gt 0 ] || { echo "ERROR: no NN-*.sh tests in $dir"; exit 1; }
+          # Sort one path per line; mapfile keeps each line intact (no word
+          # splitting or re-globbing of the sorted output).
+          mapfile -t tests < <(printf '%s\n' "${tests[@]}" | sort)
+          echo "discovered ${#tests[@]} test(s) in $dir:"
+          printf '  %s\n' "${tests[@]}"
+          for t in "${tests[@]}"; do
+            echo "::group::$t"
+            # Close the log group before failing so the error is not folded away.
+            bash "$t" || { rc=$?; echo "::endgroup::"; echo "::error::$t failed (exit $rc)"; exit "$rc"; }
+            echo "::endgroup::"
+          done
+
+      - name: Dump diagnostics on failure
+        if: failure()
+        run: |
+          # Every command is best-effort: one failing dump must not hide the rest.
+          kubectl get pods -A -o wide || true
+          kubectl logs -n kube-system deployment/fluence || true
+          kubectl logs -n kube-system deployment/fluence-webhook || true
diff --git a/.github/workflows/e2e-tests.yaml b/.github/workflows/e2e-tests.yaml
@@ -8,140 +8,15 @@ concurrency:
   group: ${{ github.workflow }}-${{ github.ref }}
   cancel-in-progress: true
 
-env:
-  KIND_VERSION: v0.32.0
-  IMAGE: vanessa/fluence:test
-
 jobs:
+  # Fan out the suites as parallel jobs, each a call into the reusable workflow.
+  # The shared setup (build, kind, deploy) lives once in e2e-suite.yaml; the
+  # matrix runs gang and quantum concurrently.
   e2e:
-    runs-on: ubuntu-latest
-    steps:
-      - name: Checkout
-        uses: actions/checkout@v4
-
-      - name: Set up Docker Buildx
-        uses: docker/setup-buildx-action@v3
-
-      - name: Build fluence image
-        uses: docker/build-push-action@v6
-        with:
-          context: .
-          file: ./Dockerfile
-          push: false
-          load: true
-          tags: ${{ env.IMAGE }}
-          cache-from: type=gha
-          cache-to: type=gha,mode=max
-
-      - name: Create k8s Kind Cluster
-        uses: helm/kind-action@v1.10.0
-        with:
-          version: v0.32.0              # required for gang
-          node_image: kindest/node:v1.36.1
-          config: ./deploy/kind-config.yaml
-
-      - name: Free Disk Space (Ubuntu)
-        run: |
-          echo "=== Disk space before cleanup ==="
-          df -h
-
-          # Remove large software runtimes and tools
-          sudo rm -rf /usr/share/dotnet
-          sudo rm -rf /usr/local/lib/android
-          sudo rm -rf /opt/ghc
-          sudo rm -rf /opt/hostedtoolcache/CodeQL
-
-          # Clean package caches
-          sudo apt-get clean          
-          echo "=== Disk space after cleanup ==="
-          df -h
-
-      - name: Load docker images
-        run: |
-          kind get clusters
-          cluster=$(kind get clusters)
-          kind load --name $cluster docker-image vanessa/fluence:test
-
-      - name: Deploy fluence (base)
-        run: |
-          kubectl apply -f deploy/fluence-test.yaml
-          kubectl rollout status -n kube-system deployment/fluence --timeout=180s
-          # rollout status can return while the OLD ReplicaSet's pod is still
-          # Running (terminating). Selecting by phase=Running alone can grab that
-          # stale pod, which then 404s on exec/logs. Wait until exactly one
-          # fluence pod remains, and require it to be Ready and not terminating.
-          POD=""
-          for i in $(seq 1 60); do
-            # names of pods that are Ready AND have no deletionTimestamp (not terminating)
-            POD=$(kubectl -n kube-system get pods -l app=fluence \
-              -o go-template='{{range .items}}{{if not .metadata.deletionTimestamp}}{{$name := .metadata.name}}{{range .status.conditions}}{{if and (eq .type "Ready") (eq .status "True")}}{{$name}}{{"\n"}}{{end}}{{end}}{{end}}{{end}}' 2>/dev/null | head -1 || true)
-            [ -n "$POD" ] && break
-            sleep 2
-          done
-          [ -n "$POD" ] || { echo "ERROR: no Ready non-terminating fluence pod found"; kubectl -n kube-system get pods -l app=fluence -o wide; exit 1; }
-          echo "Using pod: $POD"
-          # Brief sleep to let the container runtime stabilize before exec
-          sleep 5
-          kubectl -n kube-system exec "$POD" -- ls /tmp/
-          kubectl -n kube-system logs "$POD"
-          kubectl -n kube-system exec "$POD" -- /bin/bash -c "cat /tmp/fluence-graph-*.json"
-          kubectl get nodes -o jsonpath='{range .items[*]}{.metadata.name}{": cpu="}{.status.allocatable.cpu}{" mem="}{.status.allocatable.memory}{"\n"}{end}'
-
-      - name: E2E - classical gang
-        run: bash test/e2e/01-classical-gang.sh
-
-      - name: Deploy quantum add-on
-        run: |
-          # Includes the device plugin and oriented to testing container
-          kubectl apply -f deploy/fluence-resources-test.yaml
-          kubectl rollout restart -n kube-system deployment/fluence
-          kubectl rollout status  -n kube-system deployment/fluence --timeout=60s
-          for i in $(seq 1 60); do
-            kubectl get nodes -o jsonpath='{range .items[*]}{.status.allocatable}{"\n"}{end}'
-            kubectl get nodes -o jsonpath='{range .items[*]}{.status.allocatable}{"\n"}{end}' | grep -q 'fluxion.flux-framework.org/qpu' && break
-            sleep 1
-          done
-          # After a rollout restart BOTH the old and new pods are briefly Running.
-          # Select only a Ready pod with no deletionTimestamp (i.e. the new one,
-          # not the terminating old one) so exec/logs don't 404.
-          POD=""
-          for i in $(seq 1 60); do
-            POD=$(kubectl -n kube-system get pods -l app=fluence \
-              -o go-template='{{range .items}}{{if not .metadata.deletionTimestamp}}{{$name := .metadata.name}}{{range .status.conditions}}{{if and (eq .type "Ready") (eq .status "True")}}{{$name}}{{"\n"}}{{end}}{{end}}{{end}}{{end}}' 2>/dev/null | head -1 || true)
-            [ -n "$POD" ] && break
-            sleep 2
-          done
-          [ -n "$POD" ] || { echo "ERROR: no Ready non-terminating fluence pod found after restart"; kubectl -n kube-system get pods -l app=fluence -o wide; exit 1; }
-          echo "Using pod: $POD"
-          # Brief sleep to let the container runtime stabilize before exec
-          sleep 5
-          kubectl -n kube-system exec "$POD" -- /bin/bash -c "cat /tmp/fluence-graph-*.json"
-
-      - name: Wait for webhook
-        run: |
-
-          # wait for the deployment AND for the caBundle to be populated on the webhook config
-          kubectl -n kube-system rollout status deployment/fluence-webhook --timeout=120s
-          for i in $(seq 1 30); do
-            cab=$(kubectl get mutatingwebhookconfiguration fluence-webhook \
-                  -o jsonpath='{.webhooks[0].clientConfig.caBundle}' 2>/dev/null)
-            [ -n "$cab" ] && break
-            sleep 2
-          done
-          # let TLS serving settle after caBundle patch
-          sleep 3 
-
-      - name: E2E - quantum placement
-        run: bash test/e2e/02-quantum-placement.sh
-
-      #- name: E2E - restart recovery (no double-book)
-      #  run: bash test/e2e/03-restart-recovery.sh
-
-      - name: E2E - sidecar ungate
-        run: bash test/e2e/04-sidecar-ungate.sh
-
-      - name: Dump diagnostics on failure
-        if: failure()
-        run: |
-          kubectl get pods -A -o wide
-          kubectl logs -n kube-system deployment/fluence
+    strategy:
+      fail-fast: false        # one suite failing should not cancel the other
+      matrix:
+        suite: [gang, quantum]
+    uses: ./.github/workflows/e2e-suite.yaml
+    with:
+      suite: ${{ matrix.suite }}
diff --git a/Makefile b/Makefile
@@ -55,13 +55,16 @@ test-image-deploy: test-image
 	kubectl patch podgroup training -n default --type=merge -p '{"metadata":{"finalizers":null}}' || true
 	kubectl delete deployments --all
 	kubectl delete pods --all
-	kubectl delete -f deploy/fluence-test.yaml
+	kubectl delete -f deploy/fluence-test.yaml || true
 	kubectl delete pods --all
 
+.PHONY: test-deploy-recreate
+test-deploy-recreate: test-image-deploy ## Run test-image-deploy, then apply deploy/fluence-pull-test.yaml
+	kubectl apply -f deploy/fluence-pull-test.yaml
 
 .PHONY: deploy
 deploy: ## Install RBAC + scheduler into kube-system
 	kubectl apply -f deploy/fluence.yaml
 
 .PHONY: help
 help:

diff --git a/cmd/webhook/main.go b/cmd/webhook/main.go
@@ -12,9 +12,11 @@ package main
 import (
 	"context"
 	"crypto/tls"
+	"flag"
 	"log"
 	"net/http"
 	"os"
+	"strings"
 	"time"
 
 	"github.com/converged-computing/fluence/pkg/cluster"
@@ -38,6 +40,29 @@ func main() {
 	cfgName := env("WEBHOOK_CONFIG", "fluence-webhook")
 	addr := env("WEBHOOK_ADDR", ":8443")
 
+	// Handler selection. By default ALL registered handlers are enabled. The
+	// operator may restrict the active set with --handlers (comma-separated) or
+	// the FLUENCE_HANDLERS env var, e.g. --handlers=fluxion,gang to run without
+	// quantum. An empty value means all enabled. Unknown names are warned about
+	// but not fatal (so config survives a handler being renamed/removed).
+	handlersFlag := flag.String("handlers", env("FLUENCE_HANDLERS", ""),
+		"comma-separated handlers in dispatch order (default: fluxion,quantum,gang). e.g. fluxion,gang disables quantum")
+	flag.Parse()
+
+	var requested []string
+	if *handlersFlag != "" {
+		for _, n := range strings.Split(*handlersFlag, ",") {
+			if n = strings.TrimSpace(n); n != "" {
+				requested = append(requested, n)
+			}
+		}
+	}
+	active, unknown := webhook.SetActiveHandlers(requested)
+	for _, n := range unknown {
+		log.Printf("[fluence-webhook] WARNING: unknown handler %q — ignoring", n)
+	}
+	log.Printf("[fluence-webhook] active handlers (in dispatch order): %v", active)
+
 	dnsNames := []string{
 		svc + "." + ns + ".svc",
 		svc + "." + ns + ".svc.cluster.local",
@@ -87,7 +112,6 @@ func main() {
 	mutator := &webhook.Mutator{
 		AttributeKeys: attrKeys,
 		Clientset:     client,
-		SidecarImage:  env("FLUENCE_SIDECAR_IMAGE", ""),
 	}
 	log.Printf("[fluence-webhook] env contract injected into fluxion pods: %v", mutator.EnvVarNames())