From d961e7b979f61b022ae7418b1ada6cbbc2efb169 Mon Sep 17 00:00:00 2001 From: xnoto Date: Thu, 30 Apr 2026 11:14:19 -0600 Subject: [PATCH] feat(arc): add Prometheus monitoring and Grafana dashboard --- .github/CODEOWNERS | 2 +- operators/arc/arcsystem.yaml | 5 + workloads/arc/arc-tf-application.yaml | 72 +- workloads/arc/kustomization.yaml | 1 + workloads/arc/metrics-monitoring.yaml | 85 + workloads/grafana/arc-dashboard.yaml | 2183 +++++++++++++++++++++++++ workloads/grafana/datasource.yaml | 1 + workloads/grafana/kustomization.yaml | 1 + 8 files changed, 2348 insertions(+), 2 deletions(-) create mode 100644 workloads/arc/metrics-monitoring.yaml create mode 100644 workloads/grafana/arc-dashboard.yaml diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index a937197..761ceb4 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -1 +1 @@ -* @xnoto +* @makeitworkcloud/admins diff --git a/operators/arc/arcsystem.yaml b/operators/arc/arcsystem.yaml index 4601054..3daade6 100644 --- a/operators/arc/arcsystem.yaml +++ b/operators/arc/arcsystem.yaml @@ -37,6 +37,11 @@ spec: targetRevision: 0.13.1 helm: releaseName: arc + valuesObject: + metrics: + controllerManagerAddr: ":8080" + listenerAddr: ":8080" + listenerEndpoint: /metrics destination: server: "https://kubernetes.default.svc" namespace: arc-systems diff --git a/workloads/arc/arc-tf-application.yaml b/workloads/arc/arc-tf-application.yaml index 294fd63..bc38d9d 100644 --- a/workloads/arc/arc-tf-application.yaml +++ b/workloads/arc/arc-tf-application.yaml @@ -28,7 +28,77 @@ spec: githubConfigSecret: arc-runner-github-token githubConfigUrl: https://github.com/makeitworkcloud maxRunners: 3 - minRunners: 0 + minRunners: 1 + listenerTemplate: + spec: + containers: + - name: listener + ports: + - name: metrics + containerPort: 8080 + protocol: TCP + listenerMetrics: + counters: + gha_started_jobs_total: + labels: + - repository + - organization + - enterprise + - job_name + - event_name + - job_workflow_ref + - 
job_workflow_name + - job_workflow_target + gha_completed_jobs_total: + labels: + - repository + - organization + - enterprise + - job_name + - event_name + - job_result + - job_workflow_ref + - job_workflow_name + - job_workflow_target + gauges: + gha_assigned_jobs: + labels: [name, namespace, repository, organization, enterprise] + gha_running_jobs: + labels: [name, namespace, repository, organization, enterprise] + gha_registered_runners: + labels: [name, namespace, repository, organization, enterprise] + gha_busy_runners: + labels: [name, namespace, repository, organization, enterprise] + gha_min_runners: + labels: [name, namespace, repository, organization, enterprise] + gha_max_runners: + labels: [name, namespace, repository, organization, enterprise] + gha_desired_runners: + labels: [name, namespace, repository, organization, enterprise] + gha_idle_runners: + labels: [name, namespace, repository, organization, enterprise] + histograms: + gha_job_startup_duration_seconds: + labels: + - repository + - organization + - enterprise + - job_name + - event_name + - job_workflow_ref + - job_workflow_name + - job_workflow_target + gha_job_execution_duration_seconds: + labels: + - repository + - organization + - enterprise + - job_name + - event_name + - job_result + - job_workflow_ref + - job_workflow_name + - job_workflow_target template: spec: serviceAccountName: arc-tf-runner diff --git a/workloads/arc/kustomization.yaml b/workloads/arc/kustomization.yaml index d92610e..fe78f35 100644 --- a/workloads/arc/kustomization.yaml +++ b/workloads/arc/kustomization.yaml @@ -5,5 +5,6 @@ resources: - namespace.yaml - runner-rbac.yaml - arc-tf-application.yaml + - metrics-monitoring.yaml generators: - ksops-arc-secrets.yaml diff --git a/workloads/arc/metrics-monitoring.yaml b/workloads/arc/metrics-monitoring.yaml new file mode 100644 index 0000000..d25cb29 --- /dev/null +++ b/workloads/arc/metrics-monitoring.yaml @@ -0,0 +1,85 @@ +--- +# ARC publishes Prometheus metrics from 
the controller-manager and scale-set +# listener pods. Ephemeral runner pods do not expose separate ARC metrics; +# their health and resource usage are covered by the cluster's standard pod +# metrics and kube-state-metrics. +--- +apiVersion: v1 +kind: Service +metadata: + name: arc-controller-metrics + namespace: arc-systems + annotations: + argocd.argoproj.io/sync-wave: "1" + ignore-check.kube-linter.io/dangling-service: "Targets ARC chart-managed controller pods not declared in this repo" + labels: + app.kubernetes.io/component: controller-manager + app.kubernetes.io/name: gha-rs-controller +spec: + selector: + app.kubernetes.io/component: controller-manager + app.kubernetes.io/name: gha-rs-controller + ports: + - name: metrics + port: 8080 + targetPort: 8080 +--- +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + name: arc-controller + namespace: arc-systems + annotations: + argocd.argoproj.io/sync-wave: "1" +spec: + namespaceSelector: + matchNames: + - arc-systems + selector: + matchLabels: + app.kubernetes.io/component: controller-manager + app.kubernetes.io/name: gha-rs-controller + endpoints: + - port: metrics + path: /metrics + interval: 30s +--- +apiVersion: v1 +kind: Service +metadata: + name: arc-tf-listener-metrics + namespace: arc-runners + annotations: + argocd.argoproj.io/sync-wave: "1" + ignore-check.kube-linter.io/dangling-service: "Targets ARC listener pods created dynamically by the scale-set controller" + labels: + actions.github.com/scale-set-name: arc-tf + app.kubernetes.io/component: runner-scale-set-listener +spec: + selector: + actions.github.com/scale-set-name: arc-tf + app.kubernetes.io/component: runner-scale-set-listener + ports: + - name: metrics + port: 8080 + targetPort: 8080 +--- +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + name: arc-tf-listener + namespace: arc-runners + annotations: + argocd.argoproj.io/sync-wave: "1" +spec: + namespaceSelector: + matchNames: + - arc-runners + 
selector: + matchLabels: + actions.github.com/scale-set-name: arc-tf + app.kubernetes.io/component: runner-scale-set-listener + endpoints: + - port: metrics + path: /metrics + interval: 30s diff --git a/workloads/grafana/arc-dashboard.yaml b/workloads/grafana/arc-dashboard.yaml new file mode 100644 index 0000000..d02db89 --- /dev/null +++ b/workloads/grafana/arc-dashboard.yaml @@ -0,0 +1,2183 @@ +# Adapted from the upstream ARC sample dashboard: +# https://github.com/actions/actions-runner-controller/blob/8c84ab2f4267ce61df1f6b2e26a3ca12b2d1e4ca/docs/gha-runner-scale-set-controller/samples/grafana-dashboard/ARC-Autoscaling-Runner-Set-Monitoring.json +# Datasource references are pinned to the in-cluster Prometheus datasource UID used by this repo. +--- +apiVersion: grafana.integreatly.org/v1beta1 +kind: GrafanaDashboard +metadata: + name: arc-autoscaling-runner-set + namespace: grafana + annotations: + argocd.argoproj.io/sync-wave: "1" +spec: + instanceSelector: + matchLabels: + dashboards: "grafana" + json: | + { + "__elements": {}, + "__requires": [ + { + "type": "panel", + "id": "bargauge", + "name": "Bar gauge", + "version": "" + }, + { + "type": "panel", + "id": "gauge", + "name": "Gauge", + "version": "" + }, + { + "type": "grafana", + "id": "grafana", + "name": "Grafana", + "version": "11.5.2" + }, + { + "type": "panel", + "id": "heatmap", + "name": "Heatmap", + "version": "" + }, + { + "type": "datasource", + "id": "prometheus", + "name": "Prometheus", + "version": "1.0.0" + }, + { + "type": "panel", + "id": "stat", + "name": "Stat", + "version": "" + }, + { + "type": "panel", + "id": "timeseries", + "name": "Time series", + "version": "" + } + ], + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": false, + "fiscalYearStartMonth": 0, + 
"graphTooltip": 0, + "id": null, + "links": [], + "liveNow": true, + "panels": [ + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 15, + "panels": [], + "title": "Runner Performance", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "description": "Heat map showing the typical time before a job starts and whether the number of jobs in that time bucket are increasing or decreasing.", + "fieldConfig": { + "defaults": { + "custom": { + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "scaleDistribution": { + "type": "linear" + } + } + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 7, + "x": 0, + "y": 1 + }, + "id": 7, + "options": { + "calculate": false, + "cellGap": 1, + "color": { + "exponent": 0.5, + "fill": "dark-orange", + "mode": "scheme", + "reverse": false, + "scale": "exponential", + "scheme": "Turbo", + "steps": 64 + }, + "exemplars": { + "color": "rgba(255,0,255,0.7)" + }, + "filterValues": { + "le": 1e-09 + }, + "legend": { + "show": true + }, + "rowsFrame": { + "layout": "auto" + }, + "tooltip": { + "mode": "single", + "showColorScale": false, + "yHistogram": false + }, + "yAxis": { + "axisLabel": "Wait Time", + "axisPlacement": "left", + "reverse": false, + "unit": "s" + } + }, + "pluginVersion": "11.5.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "disableTextWrap": false, + "editorMode": "code", + "exemplar": false, + "expr": "sum by(le) (increase(gha_job_startup_duration_seconds_bucket{actions_github_com_scale_set_name=~\"$Scaleset\", actions_github_com_scale_set_namespace=~\"$RunnerNamespace\"}[$__rate_interval]))", + "format": "heatmap", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": false, + "interval": "", + "legendFormat": "{{le}}", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "Startup Duration", + "type": "heatmap" + }, 
+ { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "description": "Heat map showing the typical time to complete a job and whether the number of jobs in that time bucket are increasing or decreasing.", + "fieldConfig": { + "defaults": { + "custom": { + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "scaleDistribution": { + "type": "linear" + } + } + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 8, + "x": 7, + "y": 1 + }, + "id": 6, + "options": { + "calculate": false, + "cellGap": 1, + "color": { + "exponent": 0.5, + "fill": "dark-orange", + "mode": "scheme", + "reverse": false, + "scale": "exponential", + "scheme": "Spectral", + "steps": 64 + }, + "exemplars": { + "color": "rgba(255,0,255,0.7)" + }, + "filterValues": { + "le": 1e-09 + }, + "legend": { + "show": true + }, + "rowsFrame": { + "layout": "auto" + }, + "tooltip": { + "mode": "single", + "showColorScale": false, + "yHistogram": false + }, + "yAxis": { + "axisLabel": "Time", + "axisPlacement": "left", + "reverse": false, + "unit": "s" + } + }, + "pluginVersion": "11.5.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "disableTextWrap": false, + "editorMode": "code", + "exemplar": false, + "expr": "sum by(le) (increase(gha_job_execution_duration_seconds_bucket{actions_github_com_scale_set_name=~\"$Scaleset\", actions_github_com_scale_set_namespace=~\"$RunnerNamespace\"}[$__rate_interval]))", + "format": "heatmap", + "fullMetaSearch": false, + "includeNullMetadata": false, + "instant": false, + "legendFormat": "{{le}}", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "Job Execution", + "type": "heatmap" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "description": "The number of jobs assigned to the scale set. The threshold is triggered when the number of assigned jobs exceeds the number of desired runners. 
This indicates that not all jobs will have an available runner.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "fieldMinMax": false, + "mappings": [], + "min": 0, + "noValue": "0", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 3, + "x": 0, + "y": 8 + }, + "id": 9, + "options": { + "minVizHeight": 75, + "minVizWidth": 75, + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": true, + "showThresholdMarkers": true, + "sizing": "auto" + }, + "pluginVersion": "11.5.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "disableTextWrap": false, + "editorMode": "code", + "expr": "sum(gha_assigned_jobs{namespace=~\"$RunnerNamespace\", actions_github_com_scale_set_name=~\"$Scaleset\"})", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A", + "useBackend": false + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "sum(gha_desired_runners{namespace=~\"$RunnerNamespace\", actions_github_com_scale_set_name=~\"$Scaleset\"}) + 1", + "hide": false, + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "DesiredRunners" + } + ], + "title": "Assigned Jobs", + "transformations": [ + { + "id": "configFromData", + "options": { + "configRefId": "DesiredRunners", + "mappings": [ + { + "fieldName": "Time", + "handlerKey": "__ignore" + }, + { + "fieldName": "sum(gha_desired_runners{namespace=~\"$RunnerNamespace\", actions_github_com_scale_set_name=~\"$Scaleset\"}) + 1", + "handlerKey": "threshold1" + } + ] + } + } + ], + "type": "gauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + 
"description": "Number of runners desired by the scale set", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "decimals": 0, + "fieldMinMax": false, + "mappings": [], + "noValue": "0", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 3, + "x": 3, + "y": 8 + }, + "id": 4, + "options": { + "minVizHeight": 75, + "minVizWidth": 75, + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": true, + "showThresholdMarkers": true, + "sizing": "auto" + }, + "pluginVersion": "11.5.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "disableTextWrap": false, + "editorMode": "code", + "expr": "sum(gha_desired_runners{actions_github_com_scale_set_name=~\"$Scaleset\", namespace=~\"$RunnerNamespace\"})", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "Desired Runners", + "type": "gauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "description": "Number of registered runners that do not have assigned jobs.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "decimals": 0, + "fieldMinMax": false, + "mappings": [], + "noValue": "0", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 3, + "x": 6, + "y": 8 + }, + "id": 2, + "options": { + "minVizHeight": 75, + "minVizWidth": 75, + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": true, + 
"showThresholdMarkers": true, + "sizing": "auto" + }, + "pluginVersion": "11.5.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "disableTextWrap": false, + "editorMode": "code", + "expr": "sum(gha_idle_runners{actions_github_com_scale_set_name=~\"$Scaleset\", namespace=~\"$RunnerNamespace\"})", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "Idle Runners", + "type": "gauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "description": "The number of workflow jobs currently executing", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "noValue": "0", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 3, + "x": 9, + "y": 8 + }, + "id": 10, + "options": { + "minVizHeight": 75, + "minVizWidth": 75, + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true, + "sizing": "auto" + }, + "pluginVersion": "11.5.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "sum (gha_running_jobs{actions_github_com_scale_set_name=~\"$Scaleset\", actions_github_com_scale_set_namespace=~\"$RunnerNamespace\"})", + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Running Jobs", + "type": "gauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "description": "The number of runners in a failed state. 
These runners are typically misconfigured and count against the scale set's maximum limit.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "noValue": "0", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 1 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 3, + "x": 12, + "y": 8 + }, + "id": 26, + "options": { + "minVizHeight": 75, + "minVizWidth": 75, + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true, + "sizing": "auto" + }, + "pluginVersion": "11.5.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "sum(gha_controller_failed_ephemeral_runners{name=~\"$Scaleset\", namespace=~\"$RunnerNamespace\"})", + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Failed Runners", + "type": "gauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "description": "The number of active scale set listeners", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "fieldMinMax": true, + "mappings": [], + "noValue": "0", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 3, + "x": 0, + "y": 13 + }, + "id": 5, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "11.5.2", + "targets": [ + { + 
"datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "sum(gha_controller_running_listeners{namespace=~\"$SystemNamespace\"})", + "hide": false, + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Listeners", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "description": "Number of runner pods that are waiting to be created. When this number exceeds the number of pods Kubernetes reports as Waiting, it indicates cluster performance issues.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "decimals": 0, + "fieldMinMax": false, + "mappings": [], + "noValue": "0", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 3, + "x": 3, + "y": 13 + }, + "id": 3, + "options": { + "minVizHeight": 75, + "minVizWidth": 75, + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": true, + "showThresholdMarkers": true, + "sizing": "auto" + }, + "pluginVersion": "11.5.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "disableTextWrap": false, + "editorMode": "code", + "expr": "sum(gha_controller_pending_ephemeral_runners{name=~\"$Scaleset\", namespace=~\"$RunnerNamespace\"})", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A", + "useBackend": false + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "sum(kube_pod_container_status_waiting{namespace=~\"$RunnerNamespace\"}) != 0 or vector(0)", + "hide": false, + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "Waiting" + } 
+ ], + "title": "Pending Runners", + "transformations": [ + { + "id": "configFromData", + "options": { + "configRefId": "Waiting", + "mappings": [ + { + "fieldName": "Time", + "handlerKey": "__ignore" + }, + { + "fieldName": "sum(kube_pod_container_status_waiting{namespace=~\"$RunnerNamespace\"}) != 0 or vector(0)", + "handlerKey": "threshold1" + } + ] + } + } + ], + "type": "gauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "description": "The number of runners registered for processing queued jobs", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "noValue": "0", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 3, + "x": 6, + "y": 13 + }, + "id": 8, + "options": { + "minVizHeight": 75, + "minVizWidth": 75, + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": true, + "showThresholdMarkers": true, + "sizing": "auto" + }, + "pluginVersion": "11.5.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "sum(gha_registered_runners{actions_github_com_scale_set_name=~\"$Scaleset\", namespace=~\"$RunnerNamespace\"})", + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Registered Runners", + "type": "gauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "description": "Number of runner pods in a running state", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "decimals": 0, + "fieldMinMax": false, + "mappings": [], + "noValue": "0", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + 
} + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 3, + "x": 9, + "y": 13 + }, + "id": 1, + "options": { + "minVizHeight": 75, + "minVizWidth": 75, + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": true, + "showThresholdMarkers": true, + "sizing": "auto" + }, + "pluginVersion": "11.5.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "disableTextWrap": false, + "editorMode": "code", + "expr": "max(gha_controller_running_ephemeral_runners{name=~\"$Scaleset\", namespace=~\"$RunnerNamespace\"})", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "Active Runners", + "type": "gauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "description": "The number of containers that are reporting that they were terminated by an out-of-memory condition (OOMKiller)", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "noValue": "No issues detected", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "semi-dark-red", + "value": 1 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 3, + "x": 12, + "y": 13 + }, + "id": 23, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "11.5.2", + "targets": [ + { + "editorMode": "code", + "expr": "sum(kube_pod_container_status_last_terminated_reason{reason=\"OOMKilled\",namespace=~\"$RunnerNamespace\"}) by (namespace)", + 
"legendFormat": "__auto", + "range": true, + "refId": "A", + "datasource": { + "type": "prometheus", + "uid": "prometheus" + } + } + ], + "title": "Out of Memory", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "description": "The peak memory used by a container in a given scale set's namespace.", + "fieldConfig": { + "defaults": { + "color": { + "fixedColor": "semi-dark-green", + "mode": "shades" + }, + "fieldMinMax": false, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "decbytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 6, + "w": 5, + "x": 0, + "y": 18 + }, + "id": 12, + "options": { + "minVizHeight": 75, + "minVizWidth": 75, + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": false, + "sizing": "auto" + }, + "pluginVersion": "11.5.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "max(container_memory_working_set_bytes{namespace=~\"$RunnerNamespace\"}) by (namespace)", + "format": "time_series", + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Peak Container Memory", + "type": "gauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "description": "The sum of the reads and writes occurring within the runner namespace.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "Bytes", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 54, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + 
"tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "log": 2, + "type": "log" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "decbytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 6, + "w": 6, + "x": 5, + "y": 18 + }, + "id": 13, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "11.5.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "sum(rate(container_fs_writes_bytes_total{namespace=~\"$RunnerNamespace\"}[$__rate_interval])) > 0 or vector(0)", + "instant": false, + "legendFormat": "Write", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "sum(rate(container_fs_reads_bytes_total{namespace=~\"$RunnerNamespace\"}[$__rate_interval])) > 0 or vector(0)", + "hide": false, + "instant": false, + "legendFormat": "Read", + "range": true, + "refId": "B" + } + ], + "title": "Container I/O", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "description": "The Kubernetes-reported pod status.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "min": 0, + "noValue": "No active pods", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "yellow", + "value": null + }, + { + "color": "green", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + 
"gridPos": { + "h": 6, + "w": 4, + "x": 11, + "y": 18 + }, + "id": 11, + "options": { + "displayMode": "lcd", + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": false + }, + "maxVizHeight": 300, + "minVizHeight": 10, + "minVizWidth": 0, + "namePlacement": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "showUnfilled": true, + "sizing": "auto", + "valueMode": "color" + }, + "pluginVersion": "11.5.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "disableTextWrap": false, + "editorMode": "code", + "exemplar": false, + "expr": "sum(kube_pod_container_status_ready{namespace=~\"$RunnerNamespace\"}) != 0 or vector(0)", + "format": "time_series", + "fullMetaSearch": false, + "hide": false, + "includeNullMetadata": true, + "instant": false, + "interval": "", + "legendFormat": "Ready", + "range": true, + "refId": "B", + "useBackend": false + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "sum(kube_pod_container_status_waiting{namespace=~\"$RunnerNamespace\"}) != 0 or vector(0)", + "hide": false, + "instant": false, + "legendFormat": "Waiting", + "range": true, + "refId": "C" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "sum(kube_pod_container_status_terminated_reason{namespace=~\"$RunnerNamespace\", reason=\"Completed\"}) != 0 or vector(0)", + "hide": false, + "instant": false, + "interval": "", + "legendFormat": "Completed", + "range": true, + "refId": "D" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "sum(kube_pod_container_status_terminated_reason{namespace=~\"$RunnerNamespace\", reason=\"Error\"}) != 0 or vector(0)", + "hide": false, + "instant": false, + "legendFormat": "Error", + "range": true, + "refId": "E" + 
}, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "disableTextWrap": false, + "editorMode": "code", + "expr": "sum(gha_desired_runners{namespace=~\"$RunnerNamespace\", actions_github_com_scale_set_name=~\"$Scaleset\"}) + 1", + "fullMetaSearch": false, + "hide": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "DesiredRunners", + "useBackend": false + } + ], + "title": "Container Pod Status", + "transformations": [ + { + "id": "configFromData", + "options": { + "applyTo": { + "id": "byName", + "options": "Ready" + }, + "configRefId": "DesiredRunners", + "mappings": [ + { + "fieldName": "Time", + "handlerKey": "__ignore" + }, + { + "fieldName": "sum(gha_desired_runners{namespace=~\"$RunnerNamespace\", actions_github_com_scale_set_name=~\"$Scaleset\"}) + 1", + "handlerKey": "threshold1" + }, + { + "fieldName": "sum(gha_desired_runners{namespace=~\"$RunnerNamespace\", actions_github_com_scale_set_name=~\"$Scaleset\"}) - 5", + "handlerKey": "threshold1" + } + ] + } + } + ], + "type": "bargauge" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 24 + }, + "id": 16, + "panels": [], + "title": "Controller Performance", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "description": "The average time required for a reconciliation request to be processed. 
This reflects the time required for the controller to process a single request to modify a Kubernetes resource.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 33, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 0, + "y": 25 + }, + "id": 17, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "11.5.2", + "targets": [ + { + "editorMode": "code", + "expr": "rate(controller_runtime_reconcile_time_seconds_sum{namespace=~\"$SystemNamespace\"}[$__rate_interval])", + "interval": "", + "legendFormat": "{{controller}}", + "range": true, + "refId": "A", + "datasource": { + "type": "prometheus", + "uid": "prometheus" + } + } + ], + "title": "Reconcile Time", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "description": "The average time a queued reconciliation request spends waiting to be processed.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": 
false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 27, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 6, + "y": 25 + }, + "id": 18, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "11.5.2", + "targets": [ + { + "editorMode": "code", + "expr": "rate(workqueue_queue_duration_seconds_sum{namespace=~\"$SystemNamespace\"}[$__rate_interval])", + "legendFormat": "{{controller}}", + "range": true, + "refId": "A", + "datasource": { + "type": "prometheus", + "uid": "prometheus" + } + } + ], + "title": "Workqueue Queue Duration", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "description": "Errors indicate that controller has not achieved a desired state and is requesting Kubernetes to queue another request for reconciliation. Ideally, this number remains close to zero. 
An increasing number can indicate resource contention or delays processing API server requests.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 33, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 12, + "y": 25 + }, + "id": 27, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "11.5.2", + "targets": [ + { + "editorMode": "code", + "expr": "rate(controller_runtime_reconcile_errors_total{namespace=~\"$SystemNamespace\"}[$__rate_interval])", + "interval": "", + "legendFormat": "{{controller}}", + "range": true, + "refId": "A", + "datasource": { + "type": "prometheus", + "uid": "prometheus" + } + } + ], + "title": "Reconciliation Errors", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "description": "The number of reconcile requests that are waiting to be processed by the controller. A growing queue depth can indicate that the Kubernetes API Server or the controller does not have enough resources. 
This can lead to pods taking longer to be deleted or started. ", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "fieldMinMax": false, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 100 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 0, + "y": 33 + }, + "id": 20, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "max" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "11.5.2", + "targets": [ + { + "editorMode": "code", + "expr": "sum (workqueue_depth{namespace=~\"$SystemNamespace\"}) by (name)", + "legendFormat": "__auto", + "range": true, + "refId": "A", + "datasource": { + "type": "prometheus", + "uid": "prometheus" + } + } + ], + "title": "Queue Depth", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "description": "The number of workers that are currently being used to process reconcile requests. 
Increasing this number can reduce the work queue duration, but each new worker adds a small amount of time due to context switching.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 6, + "y": 33 + }, + "id": 21, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "max" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "11.5.2", + "targets": [ + { + "editorMode": "code", + "expr": "sum by (controller) (controller_runtime_active_workers{namespace=~\"$SystemNamespace\"})", + "legendFormat": "__auto", + "range": true, + "refId": "A", + "datasource": { + "type": "prometheus", + "uid": "prometheus" + } + } + ], + "title": "Active Workers", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "description": "The number of calls to the API server", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 27, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + 
"mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 12, + "y": 33 + }, + "id": 19, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "11.5.2", + "targets": [ + { + "editorMode": "code", + "expr": "sum by (method, code) (rate(rest_client_requests_total{namespace=~\"$SystemNamespace\"}[$__rate_interval]))", + "format": "time_series", + "legendFormat": "__auto", + "range": true, + "refId": "A", + "datasource": { + "type": "prometheus", + "uid": "prometheus" + } + } + ], + "title": "API Calls", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 41 + }, + "id": 25, + "panels": [], + "title": "Metrics", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "description": "The time required by Prometheus to read and process metrics. Long scrape times can delay metrics updates or lead to metrics loss. 
Increasing time often indicates issues with metrics cardinality or cluster resources.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineStyle": { + "fill": "solid" + }, + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 18, + "x": 0, + "y": 42 + }, + "id": 24, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "11.5.2", + "targets": [ + { + "editorMode": "code", + "exemplar": false, + "expr": "scrape_duration_seconds", + "instant": false, + "legendFormat": "{{job}}", + "range": true, + "refId": "A", + "datasource": { + "type": "prometheus", + "uid": "prometheus" + } + } + ], + "title": "Scrape Duration", + "type": "timeseries" + } + ], + "refresh": "5s", + "schemaVersion": 40, + "tags": [], + "templating": { + "list": [ + { + "current": {}, + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "definition": "label_values(gha_controller_running_listeners,namespace)", + "description": "The ARC system namespace", + "includeAll": true, + "label": 
"ARC System Namespace", + "multi": true, + "name": "SystemNamespace", + "options": [], + "query": { + "qryType": 1, + "query": "label_values(gha_controller_running_listeners,namespace)", + "refId": "PrometheusVariableQueryEditor-VariableQuery" + }, + "refresh": 1, + "regex": "", + "type": "query" + }, + { + "current": {}, + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "definition": "label_values(gha_desired_runners,actions_github_com_scale_set_name)", + "description": "The name of the runner scale set", + "includeAll": true, + "label": "Scale Set", + "multi": true, + "name": "Scaleset", + "options": [], + "query": { + "qryType": 1, + "query": "label_values(gha_desired_runners,actions_github_com_scale_set_name)", + "refId": "PrometheusVariableQueryEditor-VariableQuery" + }, + "refresh": 2, + "regex": "", + "type": "query" + }, + { + "current": {}, + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "definition": "label_values(gha_desired_runners{actions_github_com_scale_set_name=~\"$Scaleset\"},namespace)", + "description": "Namespace containing the runners", + "includeAll": true, + "label": "Runner Namespace", + "multi": true, + "name": "RunnerNamespace", + "options": [], + "query": { + "qryType": 1, + "query": "label_values(gha_desired_runners{actions_github_com_scale_set_name=~\"$Scaleset\"},namespace)", + "refId": "PrometheusVariableQueryEditor-VariableQuery" + }, + "refresh": 2, + "regex": "", + "type": "query" + } + ] + }, + "time": { + "from": "now-15m", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d", + "7d" + ] + }, + "timezone": "", + "title": "ARC Autoscaling Runner Set Monitoring", + "uid": "arc-autoscaling-runner-set", + "version": 1, + "weekStart": "" + } diff --git a/workloads/grafana/datasource.yaml b/workloads/grafana/datasource.yaml index e2920f1..c197628 100644 --- a/workloads/grafana/datasource.yaml +++ 
b/workloads/grafana/datasource.yaml @@ -10,6 +10,7 @@ metadata: annotations: argocd.argoproj.io/sync-wave: "1" spec: + uid: prometheus instanceSelector: matchLabels: dashboards: "grafana" diff --git a/workloads/grafana/kustomization.yaml b/workloads/grafana/kustomization.yaml index ab6b138..1a36031 100644 --- a/workloads/grafana/kustomization.yaml +++ b/workloads/grafana/kustomization.yaml @@ -4,6 +4,7 @@ kind: Kustomization resources: - grafana.yaml - datasource.yaml + - arc-dashboard.yaml - status-grafana.yaml - status-monitoring.yaml - status-datasource.yaml