diff --git a/assets/components/c2cc/clusterrole.yaml b/assets/components/c2cc/clusterrole.yaml new file mode 100644 index 0000000000..affd57f978 --- /dev/null +++ b/assets/components/c2cc/clusterrole.yaml @@ -0,0 +1,28 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: microshift-c2cc-probe +rules: +- apiGroups: + - microshift.io + resources: + - remoteclusters + verbs: + - get + - list + - watch +- apiGroups: + - microshift.io + resources: + - remoteclusters/status + verbs: + - update + - patch +- apiGroups: + - security.openshift.io + resources: + - securitycontextconstraints + verbs: + - use + resourceNames: + - privileged diff --git a/assets/components/c2cc/clusterrolebinding.yaml b/assets/components/c2cc/clusterrolebinding.yaml new file mode 100644 index 0000000000..834bb81bb8 --- /dev/null +++ b/assets/components/c2cc/clusterrolebinding.yaml @@ -0,0 +1,12 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: microshift-c2cc-probe +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: microshift-c2cc-probe +subjects: +- kind: ServiceAccount + namespace: openshift-c2cc + name: c2cc-probe diff --git a/assets/components/c2cc/deployment.yaml b/assets/components/c2cc/deployment.yaml new file mode 100644 index 0000000000..9222fad6bd --- /dev/null +++ b/assets/components/c2cc/deployment.yaml @@ -0,0 +1,68 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + namespace: openshift-c2cc + name: c2cc-probe + labels: + app: c2cc-probe +spec: + replicas: 1 + strategy: + type: Recreate + selector: + matchLabels: + app: c2cc-probe + template: + metadata: + labels: + app: c2cc-probe + annotations: + target.workload.openshift.io/management: '{"effect": "PreferredDuringScheduling"}' + openshift.io/required-scc: privileged + spec: + serviceAccountName: c2cc-probe + containers: + - name: c2cc-probe + image: '{{ .ReleaseImage.cli }}' + imagePullPolicy: IfNotPresent + command: + - 
/host/usr/bin/microshift + - c2cc-probe + ports: + - containerPort: 8080 + name: probe + protocol: TCP + livenessProbe: + httpGet: + path: / + port: 8080 + initialDelaySeconds: 10 + periodSeconds: 10 + resources: + requests: + cpu: 50m + memory: 64Mi + volumeMounts: + - name: microshift-binary + mountPath: /host/usr/bin/microshift + readOnly: true + volumes: + - name: microshift-binary + hostPath: + path: /usr/bin/microshift + type: File + nodeSelector: + node-role.kubernetes.io/master: "" + priorityClassName: system-cluster-critical + tolerations: + - key: node-role.kubernetes.io/master + operator: Exists + effect: NoSchedule + - key: node.kubernetes.io/unreachable + operator: Exists + effect: NoExecute + tolerationSeconds: 120 + - key: node.kubernetes.io/not-ready + operator: Exists + effect: NoExecute + tolerationSeconds: 120 diff --git a/assets/components/c2cc/namespace.yaml b/assets/components/c2cc/namespace.yaml new file mode 100644 index 0000000000..e99882087a --- /dev/null +++ b/assets/components/c2cc/namespace.yaml @@ -0,0 +1,11 @@ +apiVersion: v1 +kind: Namespace +metadata: + name: openshift-c2cc + labels: + pod-security.kubernetes.io/enforce: privileged + pod-security.kubernetes.io/audit: privileged + pod-security.kubernetes.io/warn: privileged + annotations: + openshift.io/node-selector: "" + workload.openshift.io/allowed: "management" diff --git a/assets/components/c2cc/service.yaml b/assets/components/c2cc/service.yaml new file mode 100644 index 0000000000..b5783dbf49 --- /dev/null +++ b/assets/components/c2cc/service.yaml @@ -0,0 +1,14 @@ +apiVersion: v1 +kind: Service +metadata: + namespace: openshift-c2cc + name: c2cc-probe +spec: + clusterIP: '{{ .ProbeServiceClusterIP }}' + ports: + - name: probe + port: 8080 + targetPort: 8080 + protocol: TCP + selector: + app: c2cc-probe diff --git a/assets/components/c2cc/serviceaccount.yaml b/assets/components/c2cc/serviceaccount.yaml new file mode 100644 index 0000000000..154825af4c --- /dev/null +++ 
b/assets/components/c2cc/serviceaccount.yaml @@ -0,0 +1,5 @@ +apiVersion: v1 +kind: ServiceAccount +metadata: + namespace: openshift-c2cc + name: c2cc-probe diff --git a/assets/crd/microshift.io_remoteclusters.yaml b/assets/crd/microshift.io_remoteclusters.yaml index 865c0e9baa..daf41edbed 100644 --- a/assets/crd/microshift.io_remoteclusters.yaml +++ b/assets/crd/microshift.io_remoteclusters.yaml @@ -53,8 +53,28 @@ spec: - probeTarget type: object status: - description: RemoteClusterStatus is populated by the probe pod in a future - ticket. + description: RemoteClusterStatus is populated by the probe pod with health + probe results. + properties: + errors: + items: + type: string + type: array + lastProbeTime: + format: date-time + type: string + lastSuccessfulProbe: + format: date-time + type: string + state: + default: NeverProbed + enum: + - NeverProbed + - Healthy + - Unhealthy + type: string + required: + - state type: object required: - spec diff --git a/cmd/microshift/main.go b/cmd/microshift/main.go index bf275fb316..a35978c32b 100644 --- a/cmd/microshift/main.go +++ b/cmd/microshift/main.go @@ -42,5 +42,6 @@ func newCommand() *cobra.Command { cmd.AddCommand(cmds.NewRestoreCommand()) cmd.AddCommand(cmds.NewHealthcheckCommand()) cmd.AddCommand(cmds.NewAddNodeCommand()) + cmd.AddCommand(cmds.NewC2CCProbeCommand()) return cmd } diff --git a/pkg/apis/microshift/v1alpha1/types.go b/pkg/apis/microshift/v1alpha1/types.go index 909ccef207..b3b68cbbd2 100644 --- a/pkg/apis/microshift/v1alpha1/types.go +++ b/pkg/apis/microshift/v1alpha1/types.go @@ -31,8 +31,18 @@ type RemoteClusterSpec struct { ProbeInterval metav1.Duration `json:"probeInterval"` } -// RemoteClusterStatus is populated by the probe pod in a future ticket. -type RemoteClusterStatus struct{} +// RemoteClusterStatus is populated by the probe pod with health probe results. 
+type RemoteClusterStatus struct { + // +kubebuilder:validation:Enum=NeverProbed;Healthy;Unhealthy + // +kubebuilder:default="NeverProbed" + State string `json:"state"` + // +optional + LastSuccessfulProbe *metav1.Time `json:"lastSuccessfulProbe,omitempty"` + // +optional + LastProbeTime *metav1.Time `json:"lastProbeTime,omitempty"` + // +optional + Errors []string `json:"errors,omitempty"` +} // +kubebuilder:object:root=true // +k8s:deepcopy-gen:interfaces=k8s.io/apimachinery/pkg/runtime.Object diff --git a/pkg/apis/microshift/v1alpha1/zz_generated.deepcopy.go b/pkg/apis/microshift/v1alpha1/zz_generated.deepcopy.go index 26ac772e45..0cdb32da4d 100644 --- a/pkg/apis/microshift/v1alpha1/zz_generated.deepcopy.go +++ b/pkg/apis/microshift/v1alpha1/zz_generated.deepcopy.go @@ -14,7 +14,7 @@ func (in *RemoteCluster) DeepCopyInto(out *RemoteCluster) { out.TypeMeta = in.TypeMeta in.ObjectMeta.DeepCopyInto(&out.ObjectMeta) out.Spec = in.Spec - out.Status = in.Status + in.Status.DeepCopyInto(&out.Status) } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new RemoteCluster. @@ -86,6 +86,19 @@ func (in *RemoteClusterSpec) DeepCopy() *RemoteClusterSpec { // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *RemoteClusterStatus) DeepCopyInto(out *RemoteClusterStatus) { *out = *in + if in.LastSuccessfulProbe != nil { + in, out := &in.LastSuccessfulProbe, &out.LastSuccessfulProbe + *out = (*in).DeepCopy() + } + if in.LastProbeTime != nil { + in, out := &in.LastProbeTime, &out.LastProbeTime + *out = (*in).DeepCopy() + } + if in.Errors != nil { + in, out := &in.Errors, &out.Errors + *out = make([]string, len(*in)) + copy(*out, *in) + } } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new RemoteClusterStatus. 
diff --git a/pkg/cmd/c2cc_probe.go b/pkg/cmd/c2cc_probe.go
new file mode 100644
index 0000000000..d2e07c79ce
--- /dev/null
+++ b/pkg/cmd/c2cc_probe.go
@@ -0,0 +1,28 @@
+package cmd
+
+import (
+	"os"
+	"os/signal"
+	"syscall"
+
+	"github.com/openshift/microshift/pkg/controllers/c2cc"
+	"github.com/spf13/cobra"
+)
+
+// NewC2CCProbeCommand returns the hidden "c2cc-probe" subcommand, which runs
+// c2cc.RunProbe with a context that is cancelled on SIGINT or SIGTERM.
+func NewC2CCProbeCommand() *cobra.Command {
+	return &cobra.Command{
+		Use:    "c2cc-probe",
+		Short:  "Run C2CC remote cluster probe (designed to run as a pod)",
+		Hidden: true,
+		RunE: func(cmd *cobra.Command, _ []string) error {
+			// RunProbe only shuts down when its context is cancelled. Cancel it
+			// on SIGINT/SIGTERM here rather than relying on the root command
+			// having installed a signal-aware context.
+			ctx, stop := signal.NotifyContext(cmd.Context(), os.Interrupt, syscall.SIGTERM)
+			defer stop()
+			return c2cc.RunProbe(ctx)
+		},
+	}
+}
diff --git a/pkg/controllers/c2cc/controller.go b/pkg/controllers/c2cc/controller.go
index 500f94ebdf..397edd98f7 100644
--- a/pkg/controllers/c2cc/controller.go
+++ b/pkg/controllers/c2cc/controller.go
@@ -225,6 +225,7 @@ func (c *C2CCRouteManager) fullReconcile(ctx context.Context) {
 		{"service-routes", c.svcRoutes.reconcile},
 		{"nftables", c.nftMgr.reconcile},
 		{"healthcheck-crs", c.healthcheck.reconcile},
+		{"probe-deployment", c.deployProbe},
 	}
 	for _, s := range subsystems {
 		if err := s.fn(ctx); err != nil {
@@ -258,6 +259,15 @@ func (c *C2CCRouteManager) cleanupAll(ctx context.Context) {
 	if c.nftMgr != nil {
 		cleanups = append(cleanups, cleanable{"nftables", c.nftMgr.cleanup})
 	}
+	cleanups = append(cleanups, cleanable{"probe-namespace", func(ctx context.Context) error {
+		return assets.DeleteNamespaces(ctx, c2ccNamespace, c.kubeconfig)
+	}})
+	cleanups = append(cleanups, cleanable{"probe-clusterrolebinding", func(ctx context.Context) error {
+		return assets.DeleteClusterRoleBindings(ctx, c2ccClusterRoleBinding, c.kubeconfig)
+	}})
+	cleanups = append(cleanups, cleanable{"probe-clusterrole", func(ctx context.Context) error {
+		return assets.DeleteClusterRoles(ctx, c2ccClusterRole, c.kubeconfig)
+	}})
 	cleanups = append(cleanups, cleanable{"healthcheck-crd", func(ctx context.Context) error {
 		return assets.DeleteCRDs(ctx, healthcheckCRD, c.kubeconfig)
 	}})
diff --git a/pkg/controllers/c2cc/deploy_probe.go
b/pkg/controllers/c2cc/deploy_probe.go
new file mode 100644
index 0000000000..cdb72b22f6
--- /dev/null
+++ b/pkg/controllers/c2cc/deploy_probe.go
@@ -0,0 +1,89 @@
+package c2cc
+
+import (
+	"bytes"
+	"context"
+	"errors"
+	"fmt"
+	"net"
+	"text/template"
+
+	"github.com/apparentlymart/go-cidr/cidr"
+	"github.com/openshift/microshift/pkg/assets"
+	"github.com/openshift/microshift/pkg/release"
+	"k8s.io/klog/v2"
+)
+
+// Asset paths of the manifests that make up the probe workload.
+var (
+	c2ccNamespace          = []string{"components/c2cc/namespace.yaml"}
+	c2ccServiceAccount     = []string{"components/c2cc/serviceaccount.yaml"}
+	c2ccClusterRole        = []string{"components/c2cc/clusterrole.yaml"}
+	c2ccClusterRoleBinding = []string{"components/c2cc/clusterrolebinding.yaml"}
+	c2ccDeployment         = []string{"components/c2cc/deployment.yaml"}
+	c2ccService            = []string{"components/c2cc/service.yaml"}
+)
+
+// deployProbe applies the probe namespace, service account, RBAC objects,
+// Deployment and Service, in that order, and returns the first error it
+// hits. The Deployment and Service manifests are rendered as templates; the
+// Service ClusterIP is derived from the first configured service network.
+func (c *C2CCRouteManager) deployProbe(ctx context.Context) error {
+	// The probe Service is pinned to the 11th address of the local service
+	// network (e.g. 10.43.0.0/16 -> 10.43.0.11).
+	const probeHostNum = 11
+
+	// Guard the index below; an empty list would otherwise panic.
+	if len(c.cfg.Network.ServiceNetwork) == 0 {
+		return errors.New("failed to compute probe service ClusterIP: no service network configured")
+	}
+	_, svcNet, err := net.ParseCIDR(c.cfg.Network.ServiceNetwork[0])
+	if err != nil {
+		return fmt.Errorf("failed to parse local service network: %w", err)
+	}
+	probeIP, err := cidr.Host(svcNet, probeHostNum)
+	if err != nil {
+		return fmt.Errorf("failed to compute probe service ClusterIP: %w", err)
+	}
+
+	params := assets.RenderParams{
+		"ReleaseImage":          release.Image,
+		"ProbeServiceClusterIP": probeIP.String(),
+	}
+
+	if err := assets.ApplyNamespaces(ctx, c2ccNamespace, c.kubeconfig); err != nil {
+		return fmt.Errorf("failed to apply c2cc namespace: %w", err)
+	}
+	if err := assets.ApplyServiceAccounts(ctx, c2ccServiceAccount, c.kubeconfig); err != nil {
+		return fmt.Errorf("failed to apply c2cc service account: %w", err)
+	}
+	if err := assets.ApplyClusterRoles(ctx, c2ccClusterRole, c.kubeconfig); err != nil {
+		return fmt.Errorf("failed to apply c2cc cluster role: %w", err)
+	}
+	if err := assets.ApplyClusterRoleBindings(ctx, c2ccClusterRoleBinding, c.kubeconfig); err != nil {
+		return fmt.Errorf("failed to apply c2cc cluster role binding: %w", err)
+	}
+	if err := assets.ApplyDeployments(ctx, c2ccDeployment, renderTemplate, params, c.kubeconfig); err != nil {
+		return fmt.Errorf("failed to apply c2cc deployment: %w", err)
+	}
+	if err := assets.ApplyServices(ctx, c2ccService, renderTemplate, params, c.kubeconfig); err != nil {
+		return fmt.Errorf("failed to apply c2cc service: %w", err)
+	}
+
+	klog.V(4).Infof("C2CC probe assets deployed (probe ClusterIP=%s)", probeIP)
+	return nil
+}
+
+// renderTemplate executes tb as a text/template with data and returns the
+// rendered bytes. Referencing a key that is missing from data is an error.
+func renderTemplate(tb []byte, data assets.RenderParams) ([]byte, error) {
+	tmpl, err := template.New("").Option("missingkey=error").Parse(string(tb))
+	if err != nil {
+		return nil, err
+	}
+	var buf bytes.Buffer
+	if err := tmpl.Execute(&buf, data); err != nil {
+		return nil, err
+	}
+	return buf.Bytes(), nil
+}
diff --git a/pkg/controllers/c2cc/probe.go b/pkg/controllers/c2cc/probe.go
new file mode 100644
index 0000000000..77b8135f99
--- /dev/null
+++ b/pkg/controllers/c2cc/probe.go
@@ -0,0 +1,322 @@
+package c2cc
+
+import (
+	"context"
+	"errors"
+	"fmt"
+	"io"
+	"net/http"
+	"sync"
+	"time"
+
+	microshiftv1alpha1 "github.com/openshift/microshift/pkg/apis/microshift/v1alpha1"
+	microshiftclientset "github.com/openshift/microshift/pkg/generated/clientset/versioned"
+	microshiftinformers "github.com/openshift/microshift/pkg/generated/informers/externalversions"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/client-go/rest"
+	"k8s.io/client-go/tools/cache"
+	"k8s.io/client-go/util/retry"
+	"k8s.io/klog/v2"
+)
+
+const (
+	unhealthyThreshold = 3
+	probeHTTPTimeout   = 5 * time.Second
+	informerResync     = 30 * time.Second
+
+	// maxProbeBodyDrain caps how much of a probe response body is read before
+	// the body is closed.
+	maxProbeBodyDrain = 4096
+
+	// Values written to RemoteClusterStatus.State.
+	probeStateNeverProbed = "NeverProbed"
+	probeStateHealthy     = "Healthy"
+	probeStateUnhealthy   = "Unhealthy"
+)
+
+// RunProbe is the entrypoint for the c2cc-probe subcommand.
+// It runs inside a pod on the cluster network, serving as both a probe
+// target (HTTP :8080) and an active prober of remote clusters.
+//
+// RunProbe blocks until ctx is cancelled, in which case it stops all probes,
+// shuts the HTTP server down and returns nil, or until the HTTP server fails,
+// in which case that error is returned.
+func RunProbe(ctx context.Context) error {
+	restCfg, err := rest.InClusterConfig()
+	if err != nil {
+		return fmt.Errorf("failed to build in-cluster config: %w", err)
+	}
+
+	msClient, err := microshiftclientset.NewForConfig(restCfg)
+	if err != nil {
+		return fmt.Errorf("failed to create microshift client: %w", err)
+	}
+
+	// Everything started below is bound to runCtx so that an early error
+	// return also stops the informers and any running probe loops.
+	runCtx, cancel := context.WithCancel(ctx)
+	defer cancel()
+
+	mux := http.NewServeMux()
+	mux.HandleFunc("/", func(w http.ResponseWriter, _ *http.Request) {
+		w.WriteHeader(http.StatusOK)
+		if _, err := fmt.Fprint(w, "ok"); err != nil {
+			klog.Errorf("Failed to write probe response: %v", err)
+		}
+	})
+	server := &http.Server{
+		Addr:              ":8080",
+		Handler:           mux,
+		ReadHeaderTimeout: 10 * time.Second,
+	}
+
+	// Buffered so the goroutine can always report its error and exit, even if
+	// RunProbe has already returned.
+	serverErr := make(chan error, 1)
+	go func() {
+		klog.Infof("Starting probe target HTTP server on :8080")
+		if err := server.ListenAndServe(); err != nil && !errors.Is(err, http.ErrServerClosed) {
+			serverErr <- err
+		}
+	}()
+
+	pm := &probeManager{
+		client: msClient,
+		probes: make(map[string]context.CancelFunc),
+	}
+
+	factory := microshiftinformers.NewSharedInformerFactory(msClient, informerResync)
+	informer := factory.Microshift().V1alpha1().RemoteClusters().Informer()
+
+	if _, err := informer.AddEventHandler(cache.ResourceEventHandlerFuncs{
+		AddFunc: func(obj interface{}) {
+			if rc, ok := obj.(*microshiftv1alpha1.RemoteCluster); ok {
+				pm.startProbe(runCtx, rc)
+			}
+		},
+		UpdateFunc: func(oldObj, newObj interface{}) {
+			oldRC, ok1 := oldObj.(*microshiftv1alpha1.RemoteCluster)
+			newRC, ok2 := newObj.(*microshiftv1alpha1.RemoteCluster)
+			if ok1 && ok2 && (oldRC.Spec.ProbeTarget != newRC.Spec.ProbeTarget ||
+				oldRC.Spec.ProbeInterval != newRC.Spec.ProbeInterval) {
+				pm.restartProbe(runCtx, newRC)
+			}
+		},
+		DeleteFunc: func(obj interface{}) {
+			rc, ok := obj.(*microshiftv1alpha1.RemoteCluster)
+			if !ok {
+				if tombstone, ok := obj.(cache.DeletedFinalStateUnknown); ok {
+					rc, _ = tombstone.Obj.(*microshiftv1alpha1.RemoteCluster)
+				}
+			}
+			if rc != nil {
+				pm.stopProbe(rc.Name)
+			}
+		},
+	}); err != nil {
+		return fmt.Errorf("failed to add RemoteCluster informer handlers: %w", err)
+	}
+
+	factory.Start(runCtx.Done())
+	factory.WaitForCacheSync(runCtx.Done())
+	klog.Infof("Probe manager running, watching RemoteCluster CRs")
+
+	var runErr error
+	select {
+	case <-ctx.Done():
+	case err := <-serverErr:
+		runErr = fmt.Errorf("probe HTTP server failed: %w", err)
+	}
+
+	pm.stopAll()
+	shutdownCtx, cancelShutdown := context.WithTimeout(context.Background(), 5*time.Second)
+	defer cancelShutdown()
+	if err := server.Shutdown(shutdownCtx); err != nil { //nolint:contextcheck // parent ctx is already cancelled
+		klog.Errorf("Probe HTTP server shutdown error: %v", err)
+	}
+	klog.Infof("Probe manager shut down")
+	return runErr
+}
+
+// probeManager runs one probe loop per RemoteCluster. probes maps a
+// RemoteCluster name to the cancel function of its loop and is guarded by mu.
+type probeManager struct {
+	client microshiftclientset.Interface
+	mu     sync.Mutex
+	probes map[string]context.CancelFunc
+}
+
+// startProbe launches a probe loop for rc unless one is already registered
+// under rc.Name. The loop stops when ctx is cancelled or when stopProbe or
+// stopAll cancels it.
+func (pm *probeManager) startProbe(ctx context.Context, rc *microshiftv1alpha1.RemoteCluster) {
+	pm.mu.Lock()
+	defer pm.mu.Unlock()
+
+	if _, exists := pm.probes[rc.Name]; exists {
+		return
+	}
+
+	probeCtx, cancel := context.WithCancel(ctx)
+	pm.probes[rc.Name] = cancel
+
+	klog.Infof("Starting probe for %q (target=%s, interval=%s)",
+		rc.Name, rc.Spec.ProbeTarget, rc.Spec.ProbeInterval.Duration)
+	go pm.runProbeLoop(probeCtx, rc.Name, rc.Spec.ProbeTarget, rc.Spec.ProbeInterval.Duration)
+}
+
+// restartProbe stops the probe registered for rc, if any, and starts a new
+// one using the target and interval from rc's spec.
+func (pm *probeManager) restartProbe(ctx context.Context, rc *microshiftv1alpha1.RemoteCluster) {
+	pm.stopProbe(rc.Name)
+	pm.startProbe(ctx, rc)
+}
+
+// stopProbe cancels and forgets the probe registered under name, if any.
+func (pm *probeManager) stopProbe(name string) {
+	pm.mu.Lock()
+	defer pm.mu.Unlock()
+
+	if cancel, exists := pm.probes[name]; exists {
+		cancel()
+		delete(pm.probes, name)
+		klog.Infof("Stopped probe for %q", name)
+	}
+}
+
+// stopAll cancels and forgets every registered probe.
+func (pm *probeManager) stopAll() {
+	pm.mu.Lock()
+	defer pm.mu.Unlock()
+
+	for name, cancel := range pm.probes {
+		cancel()
+		delete(pm.probes, name)
+	}
+}
+
+// runProbeLoop probes http://<target>/ once per interval until ctx is
+// cancelled and writes the outcome of every probe to the status of the
+// RemoteCluster called name.
+//
+// A successful probe sets the state to Healthy. A failed probe records the
+// error, but the state only becomes Unhealthy after unhealthyThreshold
+// consecutive failures; until then the state already stored on the CR is
+// kept.
+func (pm *probeManager) runProbeLoop(ctx context.Context, name, target string, interval time.Duration) {
+	// time.NewTicker panics on a non-positive interval, which would take the
+	// whole probe process down; refuse to probe instead.
+	if interval <= 0 {
+		klog.Errorf("Not probing %q: invalid probe interval %s", name, interval)
+		return
+	}
+
+	httpClient := &http.Client{Timeout: probeHTTPTimeout}
+	consecutiveFailures := 0
+	url := "http://" + target + "/"
+
+	ticker := time.NewTicker(interval)
+	defer ticker.Stop()
+
+	for {
+		select {
+		case <-ctx.Done():
+			return
+		case <-ticker.C:
+		}
+
+		probeErr := doProbe(ctx, httpClient, url)
+		if ctx.Err() != nil {
+			// The probe was stopped while the request was in flight, so the
+			// result says nothing about the remote cluster.
+			return
+		}
+
+		now := metav1.Now()
+		status := microshiftv1alpha1.RemoteClusterStatus{
+			LastProbeTime: &now,
+		}
+
+		if probeErr != nil {
+			consecutiveFailures++
+			klog.V(2).Infof("Probe %q failed (%d consecutive): %v", name, consecutiveFailures, probeErr)
+
+			status.Errors = []string{probeErr.Error()}
+			// Below the threshold State stays empty, which makes updateStatus
+			// keep the state already stored on the CR. Writing Healthy here
+			// would flip a never-reached or Unhealthy cluster to Healthy
+			// after a single failed probe (e.g. right after a restart).
+			if consecutiveFailures >= unhealthyThreshold {
+				status.State = probeStateUnhealthy
+			}
+		} else {
+			consecutiveFailures = 0
+			status.State = probeStateHealthy
+			status.LastSuccessfulProbe = &now
+		}
+
+		if err := pm.updateStatus(ctx, name, status); err != nil {
+			klog.Errorf("Failed to update status for %q: %v", name, err)
+		}
+	}
+}
+
+// doProbe sends a GET request to url and returns nil only when the response
+// status is 200 OK.
+func doProbe(ctx context.Context, client *http.Client, url string) error {
+	req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil)
+	if err != nil {
+		return fmt.Errorf("failed to create request: %w", err)
+	}
+	resp, err := client.Do(req) // #nosec G704 -- URL built from trusted RemoteCluster CR spec
+	if err != nil {
+		return fmt.Errorf("failed to execute probe request: %w", err)
+	}
+	defer func() {
+		// Read a bounded amount of the body before closing it; net/http may
+		// not reuse the connection otherwise. A read error is deliberately
+		// ignored because only the status code decides the probe result.
+		_, _ = io.Copy(io.Discard, io.LimitReader(resp.Body, maxProbeBodyDrain))
+		if err := resp.Body.Close(); err != nil {
+			klog.Errorf("Failed to close probe response body: %v", err)
+		}
+	}()
+	if resp.StatusCode != http.StatusOK {
+		return fmt.Errorf("failed with unexpected status %d", resp.StatusCode)
+	}
+	return nil
+}
+
+// updateStatus writes status to the status subresource of the RemoteCluster
+// called name, retrying on update conflicts.
+//
+// Two fields are merged with the stored status: a nil LastSuccessfulProbe and
+// an empty State both mean "keep the stored value" (State falls back to
+// NeverProbed when nothing is stored).
+func (pm *probeManager) updateStatus(ctx context.Context, name string, status microshiftv1alpha1.RemoteClusterStatus) error {
+	rcClient := pm.client.MicroshiftV1alpha1().RemoteClusters()
+
+	return retry.RetryOnConflict(retry.DefaultBackoff, func() error {
+		rc, err := rcClient.Get(ctx, name, metav1.GetOptions{})
+		if err != nil {
+			return fmt.Errorf("failed to get RemoteCluster %q: %w", name, err)
+		}
+
+		// Merge into a copy so every retry starts from the caller's status
+		// and the freshly fetched object.
+		next := status
+		if next.LastSuccessfulProbe == nil {
+			next.LastSuccessfulProbe = rc.Status.LastSuccessfulProbe
+		}
+		if next.State == "" {
+			next.State = rc.Status.State
+		}
+		if next.State == "" {
+			next.State = probeStateNeverProbed
+		}
+
+		rc.Status = next
+		_, err = rcClient.UpdateStatus(ctx, rc, metav1.UpdateOptions{})
+		return err
+	})
+}
diff --git a/scripts/auto-rebase/assets.yaml b/scripts/auto-rebase/assets.yaml
index e26b33f49e..b4f34d3f6c 100644
--- a/scripts/auto-rebase/assets.yaml
+++ b/scripts/auto-rebase/assets.yaml
@@ -291,6 +291,16 @@ assets:
       - file: release-multus-aarch64.json
       - file: release-multus-x86_64.json
 
+  - dir: components/c2cc/
+    ignore: "C2CC probe pod assets - MicroShift specific"
+    files:
+      - file: clusterrole.yaml
+      - file: clusterrolebinding.yaml
+      - file: deployment.yaml
+      - file: namespace.yaml
+      - file: service.yaml
+      - file: serviceaccount.yaml
+
   - dir: optional/observability/
     ignore: "they don't exist in upstream repository - only in microshift"
     files:
diff --git a/test/resources/c2cc.resource b/test/resources/c2cc.resource
index 0cc408bc8e..a7b77c96f4 100644
--- a/test/resources/c2cc.resource
+++ b/test/resources/c2cc.resource
@@ -290,6 +290,15 @@ Service Endpoints Should Exist
     ...    oc get endpoints hello-microshift -n ${ns} -o jsonpath='{.subsets[0].addresses[0].ip}'
     Should Not Be Empty    ${stdout}
 
+Compute 11th IP
+    [Documentation]    Return the 11th host address in a CIDR (e.g. 10.43.0.0/16 -> 10.43.0.11).
+    [Arguments]    ${cidr}
+    VAR    ${cmd}=    import ipaddress; n=ipaddress.ip_network('${cidr}', strict=False); print(n[11])
+    ${result}=    Process.Run Process    python3    -c    ${cmd}
+    Should Be Equal As Integers    ${result.rc}    0
+    ${ip}=    Strip String    ${result.stdout}
+    RETURN    ${ip}
+
 Cleanup Test Workloads
     [Documentation]    Delete test namespace on all clusters. Ignores errors.
FOR ${alias} IN cluster-a cluster-b cluster-c diff --git a/test/suites/c2cc/healthcheck.robot b/test/suites/c2cc/healthcheck.robot index 4b048fc59e..ec79765511 100644 --- a/test/suites/c2cc/healthcheck.robot +++ b/test/suites/c2cc/healthcheck.robot @@ -117,15 +117,6 @@ Verify RemoteCluster CR Spec Should Match Regexp ${interval} ^[0-9]+(s|m|h)$ END -Compute 11th IP - [Documentation] Return the 11th host address in a CIDR (e.g. 10.43.0.0/16 -> 10.43.0.11). - [Arguments] ${cidr} - VAR ${cmd}= import ipaddress; n=ipaddress.ip_network('${cidr}', strict=False); print(n[11]) - ${result}= Process.Run Process python3 -c ${cmd} - Should Be Equal As Integers ${result.rc} 0 - ${ip}= Strip String ${result.stdout} - RETURN ${ip} - Verify RemoteCluster CR Label [Documentation] Verify all RemoteCluster CRs have the app.kubernetes.io/managed-by=c2cc-route-manager label. [Arguments] ${alias} diff --git a/test/suites/c2cc/probe.robot b/test/suites/c2cc/probe.robot new file mode 100644 index 0000000000..e58e123193 --- /dev/null +++ b/test/suites/c2cc/probe.robot @@ -0,0 +1,210 @@ +*** Settings *** +Documentation Verify C2CC probe pod deployment and health status reporting. +... Checks that the probe pod is deployed, the Service has the correct +... ClusterIP, RemoteCluster CRs transition to Healthy, and the +... deployment self-heals after deletion. + +Resource ../../resources/microshift-process.resource +Resource ../../resources/kubeconfig.resource +Resource ../../resources/oc.resource +Resource ../../resources/c2cc.resource + +Suite Setup Setup +Suite Teardown Teardown + +Test Tags c2cc + + +*** Variables *** +${C2CC_NAMESPACE} openshift-c2cc +${PROBE_DEPLOYMENT} c2cc-probe + + +*** Test Cases *** +Probe Namespace Exists + [Documentation] Verify the openshift-c2cc namespace exists on all clusters. 
+ FOR ${alias} IN cluster-a cluster-b cluster-c + ${stdout}= Oc On Cluster ${alias} oc get namespace ${C2CC_NAMESPACE} -o name + Should Contain ${stdout} namespace/${C2CC_NAMESPACE} + END + +Probe Deployment Running + [Documentation] Verify the c2cc-probe deployment is running with 1 ready replica. + FOR ${alias} IN cluster-a cluster-b cluster-c + Wait Until Keyword Succeeds 2m 10s + ... Verify Probe Pod Is Ready ${alias} + END + +Probe Service Has Correct ClusterIP + [Documentation] Verify the probe service has the 11th IP of the local service CIDR. + Verify Probe Service ClusterIP cluster-a ${CLUSTER_A_SVC_CIDR} + Verify Probe Service ClusterIP cluster-b ${CLUSTER_B_SVC_CIDR} + Verify Probe Service ClusterIP cluster-c ${CLUSTER_C_SVC_CIDR} + +RemoteCluster Status Becomes Healthy + [Documentation] Wait for RemoteCluster CRs to transition to Healthy on all clusters. + Wait Until Keyword Succeeds 3m 10s + ... Verify RemoteCluster State cluster-a Healthy + Wait Until Keyword Succeeds 3m 10s + ... Verify RemoteCluster State cluster-b Healthy + Wait Until Keyword Succeeds 3m 10s + ... Verify RemoteCluster State cluster-c Healthy + +RemoteCluster Status Has LastProbeTime + [Documentation] Verify that LastProbeTime is populated on all RemoteCluster CRs. + FOR ${alias} IN cluster-a cluster-b cluster-c + ${stdout}= Oc On Cluster ${alias} + ... oc get remoteclusters.microshift.io -o jsonpath='{.items[*].status.lastProbeTime}' + Should Not Be Empty ${stdout} + @{timestamps}= Split String ${stdout} + ${count}= Get Length ${timestamps} + Should Be Equal As Integers ${count} 2 Expected 2 RemoteCluster states, got ${count} + FOR ${t} IN @{timestamps} + Should Not Be Empty ${t} + END + END + +RemoteCluster Status Has LastSuccessfulProbe + [Documentation] Verify that LastSuccessfulProbe is populated on all RemoteCluster CRs. + FOR ${alias} IN cluster-a cluster-b cluster-c + ${stdout}= Oc On Cluster ${alias} + ... 
oc get remoteclusters.microshift.io -o jsonpath='{.items[*].status.lastSuccessfulProbe}' + Should Not Be Empty ${stdout} + @{timestamps}= Split String ${stdout} + ${count}= Get Length ${timestamps} + Should Be Equal As Integers ${count} 2 Expected 2 RemoteCluster states, got ${count} + FOR ${t} IN @{timestamps} + Should Not Be Empty ${t} + END + END + +Probe Deployment Self-Heals After Deletion + [Documentation] Delete the probe deployment and verify it is recreated by the controller. + Oc On Cluster cluster-a + ... oc delete deployment ${PROBE_DEPLOYMENT} -n ${C2CC_NAMESPACE} + Wait Until Keyword Succeeds 2m 10s + ... Verify Probe Pod Is Ready cluster-a + +Probe Deployment Self-Heals After Scale Down + [Documentation] Scale down the probe deployment to 0 and verify it is restored to 1. + Oc On Cluster cluster-a + ... oc scale deployment ${PROBE_DEPLOYMENT} -n ${C2CC_NAMESPACE} --replicas=0 + Wait Until Keyword Succeeds 2m 10s + ... Verify Probe Pod Is Ready cluster-a + +RemoteCluster Status Becomes Unhealthy When Probe Fails + [Documentation] Block probe traffic on cluster-b and verify cluster-a + ... reports Unhealthy for the corresponding RemoteCluster CR. + [Setup] Ensure All Clusters Healthy + ${cr_name}= RemoteCluster CR Name From IP ${HOST2_IP} + # Apply a NetworkPolicy on cluster-b that denies all ingress to the probe pod, + # causing cluster-a's probes to cluster-b to time out. + Apply Probe Deny Policy cluster-b + # Wait for cluster-a to report Unhealthy (requires 3 consecutive failures) + Wait Until Keyword Succeeds 3m 10s + ... Verify RemoteCluster State By Name cluster-a ${cr_name} Unhealthy + # Verify the Errors field is populated in the CR status + ${errors}= Get RemoteCluster Errors By Name cluster-a ${cr_name} + Should Not Be Empty ${errors} + [Teardown] Run Keywords + ... Delete Probe Deny Policy cluster-b + ... AND Wait Until Keyword Succeeds 3m 10s + ... 
Verify RemoteCluster State By Name cluster-a ${cr_name} Healthy + + +*** Keywords *** +Setup + [Documentation] Set up SSH connections and kubeconfigs for all clusters. + Check Required Env Variables + Login MicroShift Host + Setup Kubeconfig + Register Local Cluster cluster-a + Register Remote Cluster cluster-b ${HOST2_IP} ${HOST2_SSH_PORT} ${KUBECONFIG_B} + Register Remote Cluster cluster-c ${HOST3_IP} ${HOST3_SSH_PORT} ${KUBECONFIG_C} + +Teardown + [Documentation] Close all connections and clean up kubeconfigs. + Teardown All Remote Clusters + Remove Kubeconfig + Logout MicroShift Host + +Verify Probe Pod Is Ready + [Documentation] Check that the probe deployment has 1 available replica. + [Arguments] ${alias} + ${stdout}= Oc On Cluster ${alias} + ... oc get deployment ${PROBE_DEPLOYMENT} -n ${C2CC_NAMESPACE} -o jsonpath='{.status.availableReplicas}' + Should Be Equal As Strings ${stdout} 1 + +Verify Probe Service ClusterIP + [Documentation] Verify that the probe service ClusterIP matches the 11th IP of the given CIDR. + [Arguments] ${alias} ${svc_cidr} + ${expected_ip}= Compute 11th IP ${svc_cidr} + ${actual_ip}= Oc On Cluster ${alias} + ... oc get service ${PROBE_DEPLOYMENT} -n ${C2CC_NAMESPACE} -o jsonpath='{.spec.clusterIP}' + Should Be Equal As Strings ${actual_ip} ${expected_ip} strip_spaces=True + +Verify RemoteCluster State + [Documentation] Check that all RemoteCluster CRs on this cluster have the expected state. + [Arguments] ${alias} ${expected_state} + ${stdout}= Oc On Cluster ${alias} + ... 
oc get remoteclusters.microshift.io -o jsonpath='{.items[*].status.state}' + Should Not Be Empty ${stdout} + @{states}= Split String ${stdout} + ${count}= Get Length ${states} + Should Be Equal As Integers ${count} 2 Expected 2 RemoteCluster states, got ${count} + FOR ${state} IN @{states} + Should Be Equal As Strings ${state} ${expected_state} + END + +Verify RemoteCluster State By Name + [Documentation] Check that a specific RemoteCluster CR has the expected state. + [Arguments] ${alias} ${cr_name} ${expected_state} + ${stdout}= Oc On Cluster + ... ${alias} + ... oc get remoteclusters.microshift.io ${cr_name} -o jsonpath='{.status.state}' + Should Be Equal As Strings ${stdout} ${expected_state} + +Get RemoteCluster Errors By Name + [Documentation] Return the errors field from a specific RemoteCluster CR. + [Arguments] ${alias} ${cr_name} + ${stdout}= Oc On Cluster + ... ${alias} + ... oc get remoteclusters.microshift.io ${cr_name} -o jsonpath='{.status.errors}' + RETURN ${stdout} + +RemoteCluster CR Name From IP + [Documentation] Compute the RemoteCluster CR name from a host IP (e.g. 192.168.1.2 -> c2cc-192-168-1-2). + [Arguments] ${ip} + ${dashed}= Replace String ${ip} . - + ${dashed}= Replace String ${dashed} : - + RETURN c2cc-${dashed} + +Ensure All Clusters Healthy + [Documentation] Pre-condition: all clusters must be Healthy before fault injection. + FOR ${alias} IN cluster-a cluster-b cluster-c + Wait Until Keyword Succeeds 3m 10s + ... Verify RemoteCluster State ${alias} Healthy + END + +Apply Probe Deny Policy + [Documentation] Apply a NetworkPolicy that denies all ingress to the probe pod. + [Arguments] ${alias} + ${policy}= Catenate SEPARATOR=\n + ... apiVersion: networking.k8s.io/v1 + ... kind: NetworkPolicy + ... metadata: + ... ${SPACE}${SPACE}name: deny-probe-ingress + ... ${SPACE}${SPACE}namespace: ${C2CC_NAMESPACE} + ... spec: + ... ${SPACE}${SPACE}podSelector: + ... ${SPACE}${SPACE}${SPACE}${SPACE}matchLabels: + ... 
${SPACE}${SPACE}${SPACE}${SPACE}${SPACE}${SPACE}app: c2cc-probe + ... ${SPACE}${SPACE}policyTypes: + ... ${SPACE}${SPACE}- Ingress + Oc On Cluster ${alias} echo '${policy}' | oc apply -f - + +Delete Probe Deny Policy + [Documentation] Remove the probe deny NetworkPolicy. + [Arguments] ${alias} + Oc On Cluster ${alias} + ... oc delete networkpolicy deny-probe-ingress -n ${C2CC_NAMESPACE} --ignore-not-found