Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
109 changes: 109 additions & 0 deletions .github/workflows/e2e-suite.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
# Reusable e2e workflow (workflow_call): shared setup (build image, kind, deploy
# fluence base), then run ONE test suite — a directory under test/e2e/. The
# suite's tests are DISCOVERED (every NN-*.sh, run in sorted order); adding a test
# is just dropping a file in the directory, no workflow edit. If the suite needs
# special preparation it provides a setup.sh in its directory, which is run before
# the tests (the gang suite has none; the quantum suite installs the qpu add-on).
name: e2e-suite
# Reusable only: the sole trigger is workflow_call, so a caller workflow must
# invoke this file and supply the inputs below.
on:
workflow_call:
inputs:
# Selects the directory under test/e2e/ whose setup.sh (if any) and tests run.
suite:
description: "test suite directory name under test/e2e/ (e.g. gang, quantum)"
required: true
type: string

# Image tag shared by the build step and the kind image-load step.
env:
IMAGE: vanessa/fluence:test

jobs:
# Single job: build the image, create the cluster, deploy, then run one suite.
run:
runs-on: ubuntu-latest
steps:
- name: Checkout
uses: actions/checkout@v4

- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3

# Build for local use only: the image is not pushed (push: false) but is
# loaded into the runner's Docker daemon (load: true) so a later step can copy
# it into the kind cluster. Build layers are cached via the GitHub Actions
# cache backend (type=gha).
- name: Build fluence image
uses: docker/build-push-action@v6
with:
context: .
file: ./Dockerfile
push: false
load: true
tags: ${{ env.IMAGE }}
cache-from: type=gha
cache-to: type=gha,mode=max

# Create the kind cluster from the repository's kind config, pinning both the
# kind release and the node image. Per the inline note, this kind version is
# required for the gang suite.
- name: Create k8s Kind Cluster
uses: helm/kind-action@v1.10.0
with:
version: v0.32.0 # required for gang
node_image: kindest/node:v1.36.1
config: ./deploy/kind-config.yaml

# Reclaim runner disk by deleting preinstalled toolchain directories and the
# apt package cache, then print the remaining space.
# NOTE(review): this runs after the image build and cluster creation, so it
# cannot help those steps — confirm the ordering is intentional.
- name: Free Disk Space (Ubuntu)
run: |
sudo rm -rf /usr/share/dotnet /usr/local/lib/android /opt/ghc \
/opt/hostedtoolcache/CodeQL
sudo apt-get clean
df -h

# Copy the locally built image into the kind cluster's nodes so pods can use it
# without pulling from a registry.
- name: Load docker images
  run: |
    # Take the first cluster name only, and fail loudly if there is none, rather
    # than passing an empty or multi-line value to --name.
    cluster=$(kind get clusters | head -n 1)
    [ -n "$cluster" ] || { echo "ERROR: no kind cluster found"; exit 1; }
    # IMAGE comes from the workflow-level env; read it as a quoted shell variable
    # instead of splicing the template expansion into the script text.
    kind load --name "$cluster" docker-image "$IMAGE"

# Apply the base fluence manifest, wait for the rollout, then locate a usable
# pod and print its graph file(s) plus each node's allocatable resources.
- name: Deploy fluence (base)
run: |
kubectl apply -f deploy/fluence-test.yaml
kubectl rollout status -n kube-system deployment/fluence --timeout=180s
# Poll (up to 60 tries, 2s apart) for a fluence pod that is Ready AND has no
# deletionTimestamp, so a pod that is still terminating is never selected
# for the exec below. Only the first matching name is kept.
POD=""
for i in $(seq 1 60); do
POD=$(kubectl -n kube-system get pods -l app=fluence \
-o go-template='{{range .items}}{{if not .metadata.deletionTimestamp}}{{$name := .metadata.name}}{{range .status.conditions}}{{if and (eq .type "Ready") (eq .status "True")}}{{$name}}{{"\n"}}{{end}}{{end}}{{end}}{{end}}' 2>/dev/null | head -1 || true)
[ -n "$POD" ] && break
sleep 2
done
# Nothing qualified within the polling window: list the pods and fail the step.
[ -n "$POD" ] || { echo "ERROR: no Ready non-terminating fluence pod"; kubectl -n kube-system get pods -l app=fluence -o wide; exit 1; }
echo "Using pod: $POD"
# Short pause before exec.
# NOTE(review): presumably lets the container settle — confirm it is still needed.
sleep 5
# Best-effort dump of the graph file(s); `|| true` keeps a failure here from
# failing the step.
kubectl -n kube-system exec "$POD" -- /bin/bash -c "cat /tmp/fluence-graph-*.json" || true
# Print each node's name with its allocatable cpu and memory.
kubectl get nodes -o jsonpath='{range .items[*]}{.metadata.name}{": cpu="}{.status.allocatable.cpu}{" mem="}{.status.allocatable.memory}{"\n"}{end}'

# Per-suite special setup, if the suite directory provides one.
- name: Suite setup (${{ inputs.suite }})
  env:
    # Hand the input to the script as data rather than splicing it into the
    # shell source, so its contents can never be interpreted as commands.
    SUITE: ${{ inputs.suite }}
  run: |
    s="test/e2e/${SUITE}/setup.sh"
    if [ -f "$s" ]; then
      echo "running $s"
      bash "$s"
    else
      # A suite without setup.sh is valid; say so and carry on.
      echo "no setup.sh for suite '${SUITE}' — skipping"
    fi

# Discover and run every NN-*.sh in the suite directory, in sorted order.
# The step stops at the first failing test and exits with that test's status.
- name: Run suite (${{ inputs.suite }})
  env:
    # Hand the input to the script as data rather than splicing it into the
    # shell source, so its contents can never be interpreted as commands.
    SUITE: ${{ inputs.suite }}
  run: |
    dir="test/e2e/${SUITE}"
    [ -d "$dir" ] || { echo "ERROR: no such suite dir: $dir"; exit 1; }
    # nullglob makes an unmatched pattern expand to nothing, so "no tests" is
    # detected by the length check below instead of running a literal pattern.
    shopt -s nullglob
    tests=("$dir"/[0-9]*.sh)
    [ ${#tests[@]} -gt 0 ] || { echo "ERROR: no NN-*.sh tests in $dir"; exit 1; }
    # Read the sorted names line by line. An unquoted $(sort ...) would be
    # re-globbed, and with nullglob on a name containing glob characters could
    # silently vanish from the list.
    mapfile -t tests < <(printf '%s\n' "${tests[@]}" | sort)
    echo "discovered ${#tests[@]} test(s) in $dir:"
    printf ' %s\n' "${tests[@]}"
    for t in "${tests[@]}"; do
      echo "::group::$t"
      rc=0
      bash "$t" || rc=$?
      # Always close the log group before deciding whether to stop, so a
      # failure does not leave the group open in the job log.
      echo "::endgroup::"
      if [ "$rc" -ne 0 ]; then
        echo "::error::$t failed with exit code $rc"
        exit "$rc"
      fi
    done

# Runs only when an earlier step in the job failed; prints cluster state and
# component logs for debugging.
- name: Dump diagnostics on failure
if: failure()
run: |
kubectl get pods -A -o wide
# Log collection is best-effort: `|| true` keeps the step going when a
# deployment's logs cannot be fetched.
kubectl logs -n kube-system deployment/fluence || true
kubectl logs -n kube-system deployment/fluence-webhook || true
145 changes: 10 additions & 135 deletions .github/workflows/e2e-tests.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8,140 +8,15 @@ concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true

env:
KIND_VERSION: v0.32.0
IMAGE: vanessa/fluence:test

jobs:
# Fan out the suites as parallel jobs, each a call into the reusable workflow.
# The shared setup (build, kind, deploy) lives once in e2e-suite.yaml; the
# matrix runs gang and quantum concurrently.
e2e:
runs-on: ubuntu-latest
steps:
- name: Checkout
uses: actions/checkout@v4

- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3

- name: Build fluence image
uses: docker/build-push-action@v6
with:
context: .
file: ./Dockerfile
push: false
load: true
tags: ${{ env.IMAGE }}
cache-from: type=gha
cache-to: type=gha,mode=max

- name: Create k8s Kind Cluster
uses: helm/kind-action@v1.10.0
with:
version: v0.32.0 # required for gang
node_image: kindest/node:v1.36.1
config: ./deploy/kind-config.yaml

- name: Free Disk Space (Ubuntu)
run: |
echo "=== Disk space before cleanup ==="
df -h

# Remove large software runtimes and tools
sudo rm -rf /usr/share/dotnet
sudo rm -rf /usr/local/lib/android
sudo rm -rf /opt/ghc
sudo rm -rf /opt/hostedtoolcache/CodeQL

# Clean package caches
sudo apt-get clean
echo "=== Disk space after cleanup ==="
df -h

- name: Load docker images
run: |
kind get clusters
cluster=$(kind get clusters)
kind load --name $cluster docker-image vanessa/fluence:test

- name: Deploy fluence (base)
run: |
kubectl apply -f deploy/fluence-test.yaml
kubectl rollout status -n kube-system deployment/fluence --timeout=180s
# rollout status can return while the OLD ReplicaSet's pod is still
# Running (terminating). Selecting by phase=Running alone can grab that
# stale pod, which then 404s on exec/logs. Wait until exactly one
# fluence pod remains, and require it to be Ready and not terminating.
POD=""
for i in $(seq 1 60); do
# names of pods that are Ready AND have no deletionTimestamp (not terminating)
POD=$(kubectl -n kube-system get pods -l app=fluence \
-o go-template='{{range .items}}{{if not .metadata.deletionTimestamp}}{{$name := .metadata.name}}{{range .status.conditions}}{{if and (eq .type "Ready") (eq .status "True")}}{{$name}}{{"\n"}}{{end}}{{end}}{{end}}{{end}}' 2>/dev/null | head -1 || true)
[ -n "$POD" ] && break
sleep 2
done
[ -n "$POD" ] || { echo "ERROR: no Ready non-terminating fluence pod found"; kubectl -n kube-system get pods -l app=fluence -o wide; exit 1; }
echo "Using pod: $POD"
# Brief sleep to let the container runtime stabilize before exec
sleep 5
kubectl -n kube-system exec "$POD" -- ls /tmp/
kubectl -n kube-system logs "$POD"
kubectl -n kube-system exec "$POD" -- /bin/bash -c "cat /tmp/fluence-graph-*.json"
kubectl get nodes -o jsonpath='{range .items[*]}{.metadata.name}{": cpu="}{.status.allocatable.cpu}{" mem="}{.status.allocatable.memory}{"\n"}{end}'

- name: E2E - classical gang
run: bash test/e2e/01-classical-gang.sh

- name: Deploy quantum add-on
run: |
# Includes the device plugin and oriented to testing container
kubectl apply -f deploy/fluence-resources-test.yaml
kubectl rollout restart -n kube-system deployment/fluence
kubectl rollout status -n kube-system deployment/fluence --timeout=60s
for i in $(seq 1 60); do
kubectl get nodes -o jsonpath='{range .items[*]}{.status.allocatable}{"\n"}{end}'
kubectl get nodes -o jsonpath='{range .items[*]}{.status.allocatable}{"\n"}{end}' | grep -q 'fluxion.flux-framework.org/qpu' && break
sleep 1
done
# After a rollout restart BOTH the old and new pods are briefly Running.
# Select only a Ready pod with no deletionTimestamp (i.e. the new one,
# not the terminating old one) so exec/logs don't 404.
POD=""
for i in $(seq 1 60); do
POD=$(kubectl -n kube-system get pods -l app=fluence \
-o go-template='{{range .items}}{{if not .metadata.deletionTimestamp}}{{$name := .metadata.name}}{{range .status.conditions}}{{if and (eq .type "Ready") (eq .status "True")}}{{$name}}{{"\n"}}{{end}}{{end}}{{end}}{{end}}' 2>/dev/null | head -1 || true)
[ -n "$POD" ] && break
sleep 2
done
[ -n "$POD" ] || { echo "ERROR: no Ready non-terminating fluence pod found after restart"; kubectl -n kube-system get pods -l app=fluence -o wide; exit 1; }
echo "Using pod: $POD"
# Brief sleep to let the container runtime stabilize before exec
sleep 5
kubectl -n kube-system exec "$POD" -- /bin/bash -c "cat /tmp/fluence-graph-*.json"

- name: Wait for webhook
run: |

# wait for the deployment AND for the caBundle to be populated on the webhook config
kubectl -n kube-system rollout status deployment/fluence-webhook --timeout=120s
for i in $(seq 1 30); do
cab=$(kubectl get mutatingwebhookconfiguration fluence-webhook \
-o jsonpath='{.webhooks[0].clientConfig.caBundle}' 2>/dev/null)
[ -n "$cab" ] && break
sleep 2
done
# let TLS serving settle after caBundle patch
sleep 3

- name: E2E - quantum placement
run: bash test/e2e/02-quantum-placement.sh

#- name: E2E - restart recovery (no double-book)
# run: bash test/e2e/03-restart-recovery.sh

- name: E2E - sidecar ungate
run: bash test/e2e/04-sidecar-ungate.sh

- name: Dump diagnostics on failure
if: failure()
run: |
kubectl get pods -A -o wide
kubectl logs -n kube-system deployment/fluence
strategy:
fail-fast: false # one suite failing should not cancel the other
matrix:
suite: [gang, quantum]
uses: ./.github/workflows/e2e-suite.yaml
with:
suite: ${{ matrix.suite }}
7 changes: 5 additions & 2 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -55,13 +55,16 @@ test-image-deploy: test-image
kubectl patch podgroup training -n default --type=merge -p '{"metadata":{"finalizers":null}}' || true
kubectl delete deployments --all
kubectl delete pods --all
kubectl delete -f deploy/fluence-test.yaml
kubectl delete -f deploy/fluence-test.yaml || true
kubectl delete pods --all

# Runs test-image-deploy first (prerequisite), then applies the pull-test manifest.
.PHONY: test-deploy-recreate
test-deploy-recreate: test-image-deploy
kubectl apply -f deploy/fluence-pull-test.yaml

# NOTE(review): "deploy/fluence-.yaml" below looks like a truncated or mistyped
# manifest name — confirm which file under deploy/ is intended.
.PHONY: deploy
deploy: ## Install RBAC + scheduler into kube-system
kubectl apply -f deploy/fluence.yaml
kubectl apply -f deploy/fluence-.yaml

.PHONY: help
help:
Expand Down
26 changes: 25 additions & 1 deletion cmd/webhook/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,11 @@ package main
import (
"context"
"crypto/tls"
"flag"
"log"
"net/http"
"os"
"strings"
"time"

"github.com/converged-computing/fluence/pkg/cluster"
Expand All @@ -38,6 +40,29 @@ func main() {
cfgName := env("WEBHOOK_CONFIG", "fluence-webhook")
addr := env("WEBHOOK_ADDR", ":8443")

// Handler selection. By default ALL registered handlers are enabled. The
// operator may restrict the active set with --handlers (comma-separated) or
// the FLUENCE_HANDLERS env var, e.g. --handlers=fluxion,gang to run without
// quantum. An empty value means all enabled. Unknown names are warned about
// but not fatal (so config survives a handler being renamed/removed).
handlersFlag := flag.String("handlers", env("FLUENCE_HANDLERS", ""),
"comma-separated handlers in dispatch order (default: fluxion,quantum,gang). e.g. fluxion,gang disables quantum")
flag.Parse()

var requested []string
if *handlersFlag != "" {
for _, n := range strings.Split(*handlersFlag, ",") {
if n = strings.TrimSpace(n); n != "" {
requested = append(requested, n)
}
}
}
active, unknown := webhook.SetActiveHandlers(requested)
for _, n := range unknown {
log.Printf("[fluence-webhook] WARNING: unknown handler %q — ignoring", n)
}
log.Printf("[fluence-webhook] active handlers (in dispatch order): %v", active)

dnsNames := []string{
svc + "." + ns + ".svc",
svc + "." + ns + ".svc.cluster.local",
Expand Down Expand Up @@ -87,7 +112,6 @@ func main() {
mutator := &webhook.Mutator{
AttributeKeys: attrKeys,
Clientset: client,
SidecarImage: env("FLUENCE_SIDECAR_IMAGE", ""),
}
log.Printf("[fluence-webhook] env contract injected into fluxion pods: %v", mutator.EnvVarNames())

Expand Down
Loading
Loading