Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
325 changes: 118 additions & 207 deletions .github/workflows/slo.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,275 +2,186 @@ name: SLO

on:
pull_request:
types: [opened, reopened, synchronize]
branches:
- main
workflow_dispatch:
inputs:
github_issue:
description: "GitHub issue number where the SLO results will be reported"
required: true
baseline_ref:
description: "Baseline commit/branch/tag to compare against (leave empty to auto-detect merge-base with main)"
required: false
slo_workload_duration_seconds:
description: "Duration of the SLO workload in seconds"
required: false
default: "600"
slo_workload_read_max_rps:
description: "Maximum read RPS for the SLO workload"
required: false
default: "1000"
slo_workload_write_max_rps:
description: "Maximum write RPS for the SLO workload"
required: false
default: "100"
types: [opened, reopened, synchronize, labeled]

jobs:
ydb-slo-action:
if: contains(github.event.pull_request.labels.*.name, 'SLO')

name: Run YDB SLO Tests
runs-on: ubuntu-latest

permissions:
contents: read

strategy:
fail-fast: false
matrix:
compiler: [clang, gcc]
include:
- workload: table
sdk:
- name: cpp-key-value
preset: release-test-clang
command: ""
Comment thread
polRk marked this conversation as resolved.

concurrency:
group: slo-${{ github.ref }}-${{ matrix.os }}-${{ matrix.workload }}-${{ matrix.compiler }}
group: slo-${{ github.ref }}-${{ matrix.sdk.name }}
cancel-in-progress: true

steps:
- name: Install dependencies
run: |
set -euxo pipefail
YQ_VERSION=v4.48.2
BUILDX_VERSION=0.30.1
COMPOSE_VERSION=2.40.3

sudo curl -L https://github.com/mikefarah/yq/releases/download/${YQ_VERSION}/yq_linux_amd64 -o /usr/local/bin/yq && \
sudo chmod +x /usr/local/bin/yq
sudo curl -fLo /usr/local/bin/yq \
"https://github.com/mikefarah/yq/releases/download/${YQ_VERSION}/yq_linux_amd64"
sudo chmod +x /usr/local/bin/yq

echo "Updating Docker plugins..."
sudo mkdir -p /usr/local/lib/docker/cli-plugins

echo "Installing Docker Buildx ${BUILDX_VERSION}..."
sudo curl -fLo /usr/local/lib/docker/cli-plugins/docker-buildx \
"https://github.com/docker/buildx/releases/download/v${BUILDX_VERSION}/buildx-v${BUILDX_VERSION}.linux-amd64"
sudo chmod +x /usr/local/lib/docker/cli-plugins/docker-buildx

echo "Installing Docker Compose ${COMPOSE_VERSION}..."
sudo curl -fLo /usr/local/lib/docker/cli-plugins/docker-compose \
"https://github.com/docker/compose/releases/download/v${COMPOSE_VERSION}/docker-compose-linux-x86_64"
sudo chmod +x /usr/local/lib/docker/cli-plugins/docker-compose

echo "Installed versions:"
yq --version
docker --version
docker buildx version
docker compose version

- name: Checkout current version
- name: Checkout current SDK version
uses: actions/checkout@v5
with:
path: current
path: sdk-current
fetch-depth: 0
submodules: true

- name: Determine baseline commit
id: baseline
working-directory: sdk-current
run: |
cd current
if [[ -n "${{ inputs.baseline_ref }}" ]]; then
BASELINE="${{ inputs.baseline_ref }}"
else
BASELINE=$(git merge-base HEAD origin/main)
fi
echo "sha=$BASELINE" >> $GITHUB_OUTPUT
set -euo pipefail
BASELINE=$(git merge-base HEAD origin/main)
echo "sha=${BASELINE}" >> "$GITHUB_OUTPUT"

# Try to determine a human-readable ref name for baseline
# Check if baseline is on main
if git merge-base --is-ancestor $BASELINE origin/main && \
[ "$(git rev-parse origin/main)" = "$BASELINE" ]; then
if git merge-base --is-ancestor "${BASELINE}" origin/main && \
[ "$(git rev-parse origin/main)" = "${BASELINE}" ]; then
BASELINE_REF="main"
else
# Try to find a branch containing this commit
BRANCH=$(git branch -r --contains $BASELINE | grep -v HEAD | head -1 | sed 's/.*\///' || echo "")
if [ -n "$BRANCH" ]; then
BRANCH=$(git branch -r --contains "${BASELINE}" | grep -v HEAD | head -1 | sed 's|.*/||' || echo "")
if [ -n "${BRANCH}" ]; then
BASELINE_REF="${BRANCH}@${BASELINE:0:7}"
else
BASELINE_REF="${BASELINE:0:7}"
fi
fi
echo "ref=$BASELINE_REF" >> $GITHUB_OUTPUT
echo "ref=${BASELINE_REF}" >> "$GITHUB_OUTPUT"

- name: Checkout baseline version
- name: Checkout baseline SDK version
uses: actions/checkout@v5
with:
ref: ${{ steps.baseline.outputs.sha }}
path: baseline
path: sdk-baseline
fetch-depth: 1
submodules: true

- name: Build Workload Image
run: |
echo "Cleaning up Docker system before builds..."
docker system prune -af --volumes
docker builder prune -af
df -h

# Build current version
if [ -f "$GITHUB_WORKSPACE/current/tests/slo_workloads/Dockerfile" ]; then
echo "Building current app image..."
cd "$GITHUB_WORKSPACE/current"

# Use SLO-specific .dockerignore
cp tests/slo_workloads/.dockerignore .dockerignore

docker build -t ydb-app-current \
--build-arg REF="${{ github.head_ref || github.ref_name }}" \
--build-arg PRESET=release-test-${{ matrix.compiler }} \
-f tests/slo_workloads/Dockerfile .

# Clean up .dockerignore
rm -f .dockerignore
else
echo "No current app Dockerfile found"
exit 1
fi

docker system prune -f --volumes
docker builder prune -af

# Build baseline version
if [ -f "$GITHUB_WORKSPACE/baseline/tests/slo_workloads/Dockerfile" ]; then
echo "Building baseline app image..."
cd "$GITHUB_WORKSPACE/baseline"

# Use SLO-specific .dockerignore
cp tests/slo_workloads/.dockerignore .dockerignore
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3

docker build -t ydb-app-baseline \
--build-arg REF="${{ steps.baseline.outputs.ref }}" \
--build-arg PRESET=release-test-${{ matrix.compiler }} \
-f tests/slo_workloads/Dockerfile .

# Clean up .dockerignore
rm -f .dockerignore
else
echo "No baseline app Dockerfile found"
exit 1
fi

docker system prune -f --volumes
docker builder prune -af
# Use current's workload harness (Dockerfile, sources, .dockerignore) for
# both builds so only the SDK library differs between current and
# baseline. Without this the baseline image picks up the harness from
# the merge-base commit, which can lag behind the action's contract.
# buildx also expects .dockerignore at the context root, not under
# tests/, so copy it up in each checkout.
- name: Stage workload harness
run: |
set -euxo pipefail
rm -rf sdk-baseline/tests/slo_workloads
cp -a sdk-current/tests/slo_workloads sdk-baseline/tests/slo_workloads
cp sdk-current/tests/slo_workloads/.dockerignore sdk-current/.dockerignore
cp sdk-baseline/tests/slo_workloads/.dockerignore sdk-baseline/.dockerignore

# `cache-to: type=gha` does NOT export `--mount=type=cache` content, so
# ccache state is lost between runs. Persist /root/.ccache via host
# directory + cache-dance: actions/cache restores the host dir, the
# dance injects it into the BuildKit cache mount before the build and
# extracts the updated state afterwards for the next save.
- name: Restore ccache
id: ccache
uses: actions/cache@v4
with:
path: ccache
key: slo-ccache-${{ matrix.sdk.preset }}-${{ github.run_id }}
restore-keys: |
slo-ccache-${{ matrix.sdk.preset }}-

echo "Final disk space after builds:"
df -h
- name: Inject ccache into BuildKit
uses: reproducible-containers/buildkit-cache-dance@v3.1.2
with:
cache-map: |
{
"ccache": "/root/.ccache"
}
# Always extract so newly-compiled TUs from this run are saved by
# actions/cache (key uses ${{ github.run_id }}, so each run gets
# its own snapshot). Without extraction the cache stays frozen at
# whatever was first persisted.
skip-extraction: false

# A clean build of the SLO image takes ~30 min because the Dockerfile
# rebuilds the full C++ toolchain + abseil/protobuf/grpc from source.
# The GHA cache lets subsequent runs reuse every layer up to the SDK
# source COPY, so only the actual workload link step reruns (~3 min).
- name: Build current workload image
uses: docker/build-push-action@v6
with:
context: sdk-current
file: sdk-current/tests/slo_workloads/Dockerfile
platforms: linux/amd64
tags: ydb-app-current
load: true
build-args: PRESET=${{ matrix.sdk.preset }}
cache-from: type=gha,scope=slo-${{ matrix.sdk.preset }}
cache-to: type=gha,mode=max,scope=slo-${{ matrix.sdk.preset }}

- name: Build baseline workload image
id: baseline-build
continue-on-error: true
uses: docker/build-push-action@v6
with:
context: sdk-baseline
file: sdk-baseline/tests/slo_workloads/Dockerfile
platforms: linux/amd64
tags: ydb-app-baseline
load: true
build-args: PRESET=${{ matrix.sdk.preset }}
cache-from: type=gha,scope=slo-${{ matrix.sdk.preset }}

# If the historical commit lacks the SLO Dockerfile or can't compile,
# reuse the current image so the SLO run is still comparable against
# itself rather than failing outright.
- name: Fall back to current image for baseline
if: steps.baseline-build.outcome == 'failure'
run: |
echo "Baseline build failed; reusing current image as baseline."
docker tag ydb-app-current ydb-app-baseline

- name: Initialize YDB SLO
uses: ydb-platform/ydb-slo-action/init@main
- name: Run SLO Tests
uses: ydb-platform/ydb-slo-action/init@v2
timeout-minutes: 30
with:
github_issue: ${{ github.event.inputs.github_issue }}
github_issue: ${{ github.event.pull_request.number }}
github_token: ${{ secrets.GITHUB_TOKEN }}
workload_name: ${{ matrix.workload }}-${{ matrix.compiler }}
workload_name: ${{ matrix.sdk.name }}
workload_duration: "600"
workload_current_ref: ${{ github.head_ref || github.ref_name }}
workload_current_image: ydb-app-current
workload_current_command: ${{ matrix.sdk.command }} --read-rps 1000 --write-rps 100
workload_baseline_ref: ${{ steps.baseline.outputs.ref }}

- name: Prepare SLO Database
run: |
echo "Preparing SLO database..."
docker run --rm --network ydb_ydb-net \
--add-host "ydb:172.28.0.11" \
--add-host "ydb:172.28.0.12" \
--add-host "ydb:172.28.0.13" \
--add-host "ydb:172.28.0.99" \
ydb-app-current --connection-string grpc://ydb:2136/?database=/Root/testdb create --dont-push

- name: Run SLO Tests (parallel)
timeout-minutes: 15
run: |
DURATION=${{ inputs.slo_workload_duration_seconds || 600 }}
READ_RPS=${{ inputs.slo_workload_read_max_rps || 1000 }}
WRITE_RPS=${{ inputs.slo_workload_write_max_rps || 100 }}

ARGS="--connection-string grpc://ydb:2136/?database=/Root/testdb run \
--metrics-push-url http://prometheus:9090/api/v1/otlp/v1/metrics \
--time $DURATION \
--read-rps $READ_RPS \
--write-rps $WRITE_RPS \
--read-timeout 100 \
--write-timeout 100"

echo "Starting ydb-app-current..."
docker run -d \
--name ydb-app-current \
--network ydb_ydb-net \
--add-host "ydb:172.28.0.11" \
--add-host "ydb:172.28.0.12" \
--add-host "ydb:172.28.0.13" \
--add-host "ydb:172.28.0.99" \
ydb-app-current $ARGS

echo "Starting ydb-app-baseline..."
docker run -d \
--name ydb-app-baseline \
--network ydb_ydb-net \
--add-host "ydb:172.28.0.11" \
--add-host "ydb:172.28.0.12" \
--add-host "ydb:172.28.0.13" \
--add-host "ydb:172.28.0.99" \
ydb-app-baseline $ARGS

# Show initial logs
echo ""
echo "==================== INITIAL CURRENT LOGS ===================="
docker logs -n 15 ydb-app-current 2>&1 || echo "No current container"
echo ""
echo "==================== INITIAL BASELINE LOGS ===================="
docker logs -n 15 ydb-app-baseline 2>&1 || echo "No baseline container"
echo ""

# Wait for workloads to complete
echo "Waiting for workloads to complete (${DURATION}s)..."
sleep ${DURATION}

# Stop containers after workload duration and wait for graceful shutdown
echo "Stopping containers after ${DURATION}s..."
docker stop --timeout=30 ydb-app-current ydb-app-baseline 2>&1 || true

# Force kill if still running
docker kill ydb-app-current ydb-app-baseline 2>&1 || true

# Check exit codes
CURRENT_EXIT=$(docker inspect ydb-app-current --format='{{.State.ExitCode}}' 2>/dev/null || echo "1")
BASELINE_EXIT=$(docker inspect ydb-app-baseline --format='{{.State.ExitCode}}' 2>/dev/null || echo "0")

echo "Current container exit code: $CURRENT_EXIT"
echo "Baseline container exit code: $BASELINE_EXIT"

# Show final logs
echo ""
echo "==================== FINAL CURRENT LOGS ===================="
docker logs -n 15 ydb-app-current 2>&1 || echo "No current container"
echo ""
echo "==================== FINAL BASELINE LOGS ===================="
docker logs -n 15 ydb-app-baseline 2>&1 || echo "No baseline container"
echo ""

echo "SUCCESS: Workloads completed successfully"

- if: always()
name: Store logs
run: |
docker logs ydb-app-current > current.log 2>&1 || echo "No current container"
docker logs ydb-app-baseline > baseline.log 2>&1 || echo "No baseline container"

- if: always()
uses: actions/upload-artifact@v4
with:
name: ${{ matrix.workload }}-${{ matrix.compiler }}-slo-cpp-sdk-logs
path: |
./current.log
./baseline.log
retention-days: 1
workload_baseline_image: ydb-app-baseline
workload_baseline_command: ${{ matrix.sdk.command }} --read-rps 1000 --write-rps 100
Loading
Loading