From 65fc7c283472f321312ee20eb6021f5f69bc6f20 Mon Sep 17 00:00:00 2001 From: Vladislav Polyakov Date: Tue, 5 May 2026 16:40:35 +0300 Subject: [PATCH 01/11] tests/slo_workloads: align with ydb-slo-action v2 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The v2 SLO action (ydb-platform/ydb-slo-action) starts a workload container once per run, passes config via env vars (WORKLOAD_REF, WORKLOAD_DURATION, YDB_CONNECTION_STRING, ...), and queries sdk_operation_latency_p{50,95,99}_seconds as pre-computed gauges labeled by (operation_type, operation_status, ref). Changes: - Dockerfile: drop SLO_BRANCH_REF compile-time ref baking; a single ENTRYPOINT binary without CMD is now sufficient. - utils/utils.cpp: subcommand is optional — with no free-arg the binary runs create -> run -> cleanup sequentially (cleanup always runs, even if run errored). Option resolution follows the standard CLI > env > default priority via .DefaultValue(): - -c from YDB_CONNECTION_STRING (or built from YDB_ENDPOINT + YDB_DATABASE); - --metrics-push-url from OTEL_EXPORTER_OTLP_METRICS_ENDPOINT; - --time from WORKLOAD_DURATION. - utils/metrics.cpp: replace the OTel Histogram with HDR-backed gauges. Only successful operations are recorded; a background thread snapshots p50/p95/p99 every second, publishes the gauges with operation_status="success", then resets the HDR window. sdk_retry_attempts_total becomes a counter of (retry_attempts + 1) per op. ref is read from WORKLOAD_REF at startup, not compile time. - utils/CMakeLists.txt: pull HdrHistogram_c 0.11.8 via FetchContent (opt-in via the outer YDB_SDK_TESTS flag) and link slo-utils against hdr_histogram_static. --- tests/slo_workloads/Dockerfile | 3 +- tests/slo_workloads/utils/CMakeLists.txt | 19 +- tests/slo_workloads/utils/metrics.cpp | 239 ++++++++++++++--------- tests/slo_workloads/utils/utils.cpp | 70 ++++++- tests/slo_workloads/utils/utils.h | 3 +- 5 files changed, 237 insertions(+), 97 deletions(-) diff --git a/tests/slo_workloads/Dockerfile b/tests/slo_workloads/Dockerfile index 091ce23066..65d9c7693b 100644 --- a/tests/slo_workloads/Dockerfile +++ b/tests/slo_workloads/Dockerfile @@ -1,7 +1,6 @@ FROM ubuntu:22.04 ARG PRESET=release-test-clang -ARG REF=unknown # Install software-properties-common for add-apt-repository RUN apt-get -y update && apt-get -y install software-properties-common && add-apt-repository ppa:ubuntu-toolchain-r/test @@ -120,7 +119,7 @@ COPY . /ydb-cpp-sdk WORKDIR /ydb-cpp-sdk RUN rm -rf build -RUN cmake -DSLO_BRANCH_REF=${REF} --preset ${PRESET} +RUN cmake --preset ${PRESET} RUN cmake --build --preset default --target slo-key-value ENTRYPOINT ["./build/tests/slo_workloads/key_value/slo-key-value"] diff --git a/tests/slo_workloads/utils/CMakeLists.txt b/tests/slo_workloads/utils/CMakeLists.txt index e8589a568f..5434af9bb4 100644 --- a/tests/slo_workloads/utils/CMakeLists.txt +++ b/tests/slo_workloads/utils/CMakeLists.txt @@ -1,3 +1,16 @@ +include(FetchContent) + +FetchContent_Declare( + hdr_histogram + GIT_REPOSITORY https://github.com/HdrHistogram/HdrHistogram_c.git + GIT_TAG 0.11.8 + EXCLUDE_FROM_ALL +) +set(HDR_HISTOGRAM_BUILD_PROGRAMS OFF CACHE BOOL "" FORCE) +set(HDR_HISTOGRAM_BUILD_SHARED OFF CACHE BOOL "" FORCE) +set(HDR_LOG_REQUIRED OFF CACHE BOOL "" FORCE) +FetchContent_MakeAvailable(hdr_histogram) + add_library(slo-utils) target_link_libraries(slo-utils PUBLIC @@ -9,9 +22,9 @@ target_link_libraries(slo-utils PUBLIC opentelemetry-cpp::otlp_http_metric_exporter ) -if (SLO_BRANCH_REF) - target_compile_definitions(slo-utils PRIVATE REF=${SLO_BRANCH_REF}) -endif() +target_link_libraries(slo-utils PRIVATE + hdr_histogram_static +) target_sources(slo-utils PRIVATE executor.cpp diff --git a/tests/slo_workloads/utils/metrics.cpp b/tests/slo_workloads/utils/metrics.cpp index 50e1f859c0..996a48780f 100644 --- a/tests/slo_workloads/utils/metrics.cpp +++ b/tests/slo_workloads/utils/metrics.cpp @@ -11,21 +11,93 @@ #include +#include + +#include + +#include +#include +#include +#include +#include + using namespace std::chrono_literals; -#ifdef REF -static constexpr const std::string_view REF_LABEL = Y_STRINGIZE(REF); -#else -static constexpr const std::string_view REF_LABEL = "unknown"; -#endif +namespace { + +constexpr std::int64_t kHdrMinLatencyNs = 1'000; // 1 us +constexpr std::int64_t kHdrMaxLatencyNs = 60'000'000'000; // 60 s +constexpr int kHdrSignificantFigures = 3; + +std::string ResolveWorkloadRef() { + std::string ref = GetEnv("WORKLOAD_REF"); + return ref.empty() ? "unknown" : ref; +} + +// Minimal thread-safe wrapper around hdr_histogram for a single +// (operation_type, operation_status="success") series. Only successful +// latencies are recorded; errors are excluded from the percentile stream +// per deploy/metrics.yaml. +class TLatencyRecorder { +public: + TLatencyRecorder() { + hdr_histogram* raw = nullptr; + int rc = hdr_init(kHdrMinLatencyNs, kHdrMaxLatencyNs, kHdrSignificantFigures, &raw); + Y_ABORT_UNLESS(rc == 0, "hdr_init failed: %d", rc); + Histogram_.reset(raw); + } + + void Record(TDuration d) { + std::int64_t ns = static_cast(d.NanoSeconds()); + if (ns < kHdrMinLatencyNs) { + ns = kHdrMinLatencyNs; + } else if (ns > kHdrMaxLatencyNs) { + ns = kHdrMaxLatencyNs; + } + std::lock_guard lock(Mutex_); + hdr_record_value(Histogram_.get(), ns); + } + + // Returns p50/p95/p99 as seconds and resets the recorder window so + // gauges reflect only the most recent interval. + struct TPercentiles { + double P50 = 0.0; + double P95 = 0.0; + double P99 = 0.0; + bool HasData = false; + }; + + TPercentiles SnapshotAndReset() { + TPercentiles out; + std::lock_guard lock(Mutex_); + if (Histogram_->total_count == 0) { + return out; + } + out.HasData = true; + out.P50 = hdr_value_at_percentile(Histogram_.get(), 50.0) / 1e9; + out.P95 = hdr_value_at_percentile(Histogram_.get(), 95.0) / 1e9; + out.P99 = hdr_value_at_percentile(Histogram_.get(), 99.0) / 1e9; + hdr_reset(Histogram_.get()); + return out; + } + +private: + struct THdrDeleter { + void operator()(hdr_histogram* h) const noexcept { if (h) hdr_close(h); } + }; + + std::mutex Mutex_; + std::unique_ptr Histogram_; +}; class TOtelMetricsPusher : public IMetricsPusher { public: TOtelMetricsPusher(const std::string& metricsPushUrl, const std::string& operationType) : OperationType_(operationType) + , Ref_(ResolveWorkloadRef()) , CommonAttributes_{ - {"ref", std::string(REF_LABEL)}, + {"ref", Ref_}, {"sdk", "cpp"}, {"sdk_version", NYdb::GetSdkSemver()} } @@ -36,12 +108,11 @@ class TOtelMetricsPusher : public IMetricsPusher { auto exporter = opentelemetry::exporter::otlp::OtlpHttpMetricExporterFactory::Create(exporterOptions); opentelemetry::sdk::metrics::PeriodicExportingMetricReaderOptions readerOptions; - readerOptions.export_interval_millis = 250ms; - readerOptions.export_timeout_millis = 200ms; + readerOptions.export_interval_millis = 1000ms; + readerOptions.export_timeout_millis = 900ms; auto metricReader = opentelemetry::sdk::metrics::PeriodicExportingMetricReaderFactory::Create(std::move(exporter), readerOptions); - // Create MeterContext with resource auto context = std::make_unique( std::unique_ptr(new opentelemetry::sdk::metrics::ViewRegistry()), opentelemetry::sdk::resource::Resource::Create(opentelemetry::common::MakeKeyValueIterableView(CommonAttributes_)) @@ -53,97 +124,85 @@ class TOtelMetricsPusher : public IMetricsPusher { Meter_ = MeterProvider_->GetMeter("slo_workloads", NYdb::GetSdkSemver()); InitMetrics(); + StartPercentilePublisher(); + } + + ~TOtelMetricsPusher() override { + PublisherShouldStop_.store(true); + if (PublisherThread_.joinable()) { + PublisherThread_.join(); + } } void PushRequestData(const TRequestData& requestData) override { - if (requestData.Status == NYdb::EStatus::SUCCESS) { - OperationsSuccessTotal_->Add(1, MergeAttributes({{"operation_type", OperationType_}})); - } else { - ErrorsTotal_->Add(1, MergeAttributes({{"status", YdbStatusToString(requestData.Status)}})); - OperationsFailureTotal_->Add(1, MergeAttributes({{"operation_type", OperationType_}})); + const bool success = requestData.Status == NYdb::EStatus::SUCCESS; + const std::string status = success ? "success" : "error"; + + OperationsTotal_->Add(1, MergeAttributes({ + {"operation_type", OperationType_}, + {"operation_status", status}, + })); + + // sdk_retry_attempts_total = total number of technical attempts + // including the first one. TStatUnit counts only post-first attempts, + // so add 1 to include the initial attempt. + RetryAttemptsTotal_->Add(static_cast(requestData.RetryAttempts + 1), + MergeAttributes({ + {"operation_type", OperationType_}, + }) + ); + + if (success) { + Latency_.Record(requestData.Delay); } - OperationsTotal_->Add(1, MergeAttributes({{"operation_type", OperationType_}})); - OperationLatencySeconds_->Record(requestData.Delay.SecondsFloat(), MergeAttributes({{"operation_type", OperationType_}, {"status", YdbStatusToString(requestData.Status)}})); - RetryAttempts_->Record(requestData.RetryAttempts, MergeAttributes({{"operation_type", OperationType_}})); } private: void InitMetrics() { - ErrorsTotal_ = Meter_->CreateUInt64Counter("sdk_errors_total", - "Total number of errors encountered, categorized by error type." + OperationsTotal_ = Meter_->CreateDoubleCounter("sdk_operations_total", + "Total number of operations, categorized by operation type and status." ); - OperationsTotal_ = Meter_->CreateUInt64Counter("sdk_operations_total", - "Total number of operations, categorized by type attempted by the SDK." - ); - - OperationsSuccessTotal_ = Meter_->CreateUInt64Counter("sdk_operations_success_total", - "Total number of successful operations, categorized by type." + RetryAttemptsTotal_ = Meter_->CreateDoubleCounter("sdk_retry_attempts_total", + "Total number of retry attempts (including the first attempt), categorized by operation type." ); - OperationsFailureTotal_ = Meter_->CreateUInt64Counter("sdk_operations_failure_total", - "Total number of failed operations, categorized by type." + LatencyP50_ = Meter_->CreateDoubleGauge("sdk_operation_latency_p50_seconds", + "P50 latency of successful operations in seconds.", "s" ); - - OperationLatencySeconds_ = CreateDoubleHistogram("sdk_operation_latency_seconds", - "Latency of operations performed by the SDK in seconds, categorized by type and status.", - { - 0.001, // 1 ms - 0.002, // 2 ms - 0.003, // 3 ms - 0.004, // 4 ms - 0.005, // 5 ms - 0.0075, // 7.5 ms - 0.010, // 10 ms - 0.020, // 20 ms - 0.050, // 50 ms - 0.100, // 100 ms - 0.200, // 200 ms - 0.500, // 500 ms - 1.000, // 1 s - }, - "s" + LatencyP95_ = Meter_->CreateDoubleGauge("sdk_operation_latency_p95_seconds", + "P95 latency of successful operations in seconds.", "s" ); - - RetryAttempts_ = Meter_->CreateInt64Gauge("sdk_retry_attempts", - "Current retry attempts, categorized by operation type." + LatencyP99_ = Meter_->CreateDoubleGauge("sdk_operation_latency_p99_seconds", + "P99 latency of successful operations in seconds.", "s" ); } - std::unique_ptr> CreateDoubleHistogram( - const std::string& name, - const std::string& description, - const std::vector& buckets, - const std::string& unit = {}) - { - auto selector = std::make_unique( - opentelemetry::sdk::metrics::InstrumentType::kHistogram, - name, - unit - ); - - auto meterSelector = std::make_unique( - "slo_workloads", - NYdb::GetSdkSemver(), - "" - ); - - auto histogramConfig = std::make_shared(); - histogramConfig->boundaries_ = buckets; - - auto view = std::make_unique( - "", - "", - opentelemetry::sdk::metrics::AggregationType::kHistogram, - histogramConfig - ); - - MeterProvider_->AddView(std::move(selector), std::move(meterSelector), std::move(view)); + void StartPercentilePublisher() { + PublisherThread_ = std::thread([this]() { + while (!PublisherShouldStop_.load(std::memory_order_relaxed)) { + std::this_thread::sleep_for(1s); + PublishPercentiles(); + } + // Final flush before exit. + PublishPercentiles(); + }); + } - return Meter_->CreateDoubleHistogram(name, description, unit); + void PublishPercentiles() { + auto snapshot = Latency_.SnapshotAndReset(); + if (!snapshot.HasData) { + return; + } + auto attrs = MergeAttributes({ + {"operation_type", OperationType_}, + {"operation_status", "success"}, + }); + LatencyP50_->Record(snapshot.P50, attrs); + LatencyP95_->Record(snapshot.P95, attrs); + LatencyP99_->Record(snapshot.P99, attrs); } - // Helper to merge common attributes with metric-specific ones std::map MergeAttributes(const std::map& metricAttrs) const { std::map result = CommonAttributes_; result.insert(metricAttrs.begin(), metricAttrs.end()); @@ -151,17 +210,21 @@ class TOtelMetricsPusher : public IMetricsPusher { } std::string OperationType_; - std::map CommonAttributes_; // ref, sdk, sdk_version + std::string Ref_; + std::map CommonAttributes_; std::unique_ptr MeterProvider_; std::shared_ptr Meter_; - std::unique_ptr> ErrorsTotal_; - std::unique_ptr> OperationsTotal_; - std::unique_ptr> OperationsSuccessTotal_; - std::unique_ptr> OperationsFailureTotal_; - std::unique_ptr> OperationLatencySeconds_; - std::unique_ptr> RetryAttempts_; + std::unique_ptr> OperationsTotal_; + std::unique_ptr> RetryAttemptsTotal_; + std::unique_ptr> LatencyP50_; + std::unique_ptr> LatencyP95_; + std::unique_ptr> LatencyP99_; + + TLatencyRecorder Latency_; + std::thread PublisherThread_; + std::atomic PublisherShouldStop_{false}; }; class TNoopMetricsPusher : public IMetricsPusher { @@ -169,6 +232,8 @@ class TNoopMetricsPusher : public IMetricsPusher { void PushRequestData([[maybe_unused]] const TRequestData& requestData) override {} }; +} // namespace + std::unique_ptr CreateOtelMetricsPusher(const std::string& metricsPushUrl, const std::string& operationType) { return std::make_unique(metricsPushUrl, operationType); } diff --git a/tests/slo_workloads/utils/utils.cpp b/tests/slo_workloads/utils/utils.cpp index b7572c2981..78231c8e1c 100644 --- a/tests/slo_workloads/utils/utils.cpp +++ b/tests/slo_workloads/utils/utils.cpp @@ -7,6 +7,8 @@ #include #include #include +#include +#include #include #include #include @@ -110,6 +112,19 @@ std::string GetDatabase(const std::string& connectionString) { return {}; } +static std::string DefaultConnectionStringFromEnv() { + std::string cs = GetEnv("YDB_CONNECTION_STRING"); + if (!cs.empty()) { + return cs; + } + std::string endpoint = GetEnv("YDB_ENDPOINT"); + std::string database = GetEnv("YDB_DATABASE"); + if (!endpoint.empty() && !database.empty()) { + return TStringBuilder() << endpoint << "/?database=" << database; + } + return {}; +} + int DoMain(int argc, char** argv, TCreateCommand create, TRunCommand run, TCleanupCommand cleanup) { TOpts opts = TOpts::Default(); @@ -121,8 +136,15 @@ int DoMain(int argc, char** argv, TCreateCommand create, TRunCommand run, TClean std::string statConfigFile; std::string balancingPolicy; - opts.AddLongOption('c', "connection-string", "YDB connection string").Required().RequiredArgument("SCHEMA://HOST:PORT/?DATABASE=DATABASE") + std::string defaultConnectionString = DefaultConnectionStringFromEnv(); + + auto& connOpt = opts.AddLongOption('c', "connection-string", "YDB connection string").RequiredArgument("SCHEMA://HOST:PORT/?DATABASE=DATABASE") .StoreResult(&connectionString); + if (!defaultConnectionString.empty()) { + connOpt.DefaultValue(defaultConnectionString); + } else { + connOpt.Required(); + } opts.AddLongOption('p', "prefix", "Base prefix for tables").RequiredArgument("PATH") .StoreResult(&prefix); opts.AddLongOption('k', "token", "security token").RequiredArgument("TOKEN") @@ -136,7 +158,7 @@ int DoMain(int argc, char** argv, TCreateCommand create, TRunCommand run, TClean opts.AddLongOption('b', "balancing-policy", "Balancing policy").Optional().DefaultValue("use-all-nodes").RequiredArgument("(use-all-nodes|prefer-local-dc|prefer-primary-pile)") .StoreResult(&balancingPolicy); opts.AddHelpOption('h'); - opts.SetFreeArgsMin(1); + opts.SetFreeArgsMin(0); opts.SetFreeArgTitle(0, "", GetCmdList()); opts.ArgPermutation_ = NLastGetopt::REQUIRE_ORDER; @@ -144,7 +166,8 @@ int DoMain(int argc, char** argv, TCreateCommand create, TRunCommand run, TClean size_t freeArgsPos = res.GetFreeArgsPos(); argc -= freeArgsPos; argv += freeArgsPos; - ECommandType command = ParseCommand(*argv); + + ECommandType command = (argc > 0) ? ParseCommand(*argv) : ECommandType::All; if (command == ECommandType::Unknown) { Cerr << "Unknown command '" << *argv << "'" << Endl; return EXIT_FAILURE; @@ -202,6 +225,28 @@ int DoMain(int argc, char** argv, TCreateCommand create, TRunCommand run, TClean Cout << "Launching cleanup command..." << Endl; result = cleanup(dbOptions, argc); break; + case ECommandType::All: { + Cout << "Launching full lifecycle: create -> run -> cleanup" << Endl; + // Synthesize argv with a fake program name so the inner NLastGetopt + // parsers (ParseOptionsCreate / ParseOptionsRun) treat argv[0] + // as the program name and parse zero real args. + char programName[] = "slo"; + char* fakeArgv[] = { programName, nullptr }; + int fakeArgc = 1; + + Cout << "[all] Launching create command..." << Endl; + result = create(dbOptions, fakeArgc, fakeArgv); + if (!result) { + Cout << "[all] Launching run command..." << Endl; + result = run(dbOptions, fakeArgc, fakeArgv); + } + Cout << "[all] Launching cleanup command..." << Endl; + int cleanupRc = cleanup(dbOptions, fakeArgc); + if (!result) { + result = cleanupRc; + } + break; + } default: Cerr << "Unknown command" << Endl; return EXIT_FAILURE; @@ -216,7 +261,7 @@ int DoMain(int argc, char** argv, TCreateCommand create, TRunCommand run, TClean } std::string GetCmdList() { - return "create, run, cleanup"; + return "create, run, cleanup (omit to run create -> run -> cleanup in one process)"; } ECommandType ParseCommand(const char* cmd) { @@ -425,6 +470,11 @@ TTableStats GetTableStats(TDatabaseOptions& dbOptions, const std::string& tableN } void ParseOptionsCommon(TOpts& opts, TCommonOptions& options) { + std::string metricsPushUrlFromEnv = GetEnv("OTEL_EXPORTER_OTLP_METRICS_ENDPOINT"); + if (!metricsPushUrlFromEnv.empty()) { + options.MetricsPushUrl = metricsPushUrlFromEnv; + } + opts.AddLongOption("threads", "Number of threads to use").RequiredArgument("NUM") .DefaultValue(options.MaxInputThreads).StoreResult(&options.MaxInputThreads); opts.AddLongOption("stop-on-error", "Stop thread if an error occured").NoArgument() @@ -485,6 +535,18 @@ bool ParseOptionsCreate(int argc, char** argv, TCreateOptions& createOptions) { bool ParseOptionsRun(int argc, char** argv, TRunOptions& runOptions) { TOpts opts = TOpts::Default(); ParseOptionsCommon(opts, runOptions.CommonOptions); + + if (std::string workloadDuration = GetEnv("WORKLOAD_DURATION"); !workloadDuration.empty()) { + try { + std::uint32_t parsed = FromString(workloadDuration); + if (parsed > 0) { + runOptions.CommonOptions.SecondsToRun = parsed; + } + } catch (const std::exception& e) { + Cerr << "Invalid WORKLOAD_DURATION env value '" << workloadDuration << "': " << e.what() << Endl; + } + } + opts.AddLongOption("time", "Time to run (Seconds)").RequiredArgument("Seconds") .DefaultValue(runOptions.CommonOptions.SecondsToRun).StoreResult(&runOptions.CommonOptions.SecondsToRun); opts.AddLongOption("read-rps", "Request generation rate for read requests (Thread A)").RequiredArgument("NUM") diff --git a/tests/slo_workloads/utils/utils.h b/tests/slo_workloads/utils/utils.h index 65be9f4891..3eb3c48978 100644 --- a/tests/slo_workloads/utils/utils.h +++ b/tests/slo_workloads/utils/utils.h @@ -98,7 +98,8 @@ enum class ECommandType { Unknown, Create, Run, - Cleanup + Cleanup, + All, // No free-arg passed: execute Create -> Run -> Cleanup in one process }; struct TTableStats { From 4d8f5385b6954e23111c1bea143367c852114a17 Mon Sep 17 00:00:00 2001 From: Vladislav Polyakov Date: Tue, 5 May 2026 17:07:41 +0300 Subject: [PATCH 02/11] ci(slo): delegate to ydb-slo-action/init@v2 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Rewrite the SLO workflows following the pattern used by ydb-java-sdk (ydb-platform/ydb-java-sdk#644). slo.yml: - Drop the hand-rolled docker run orchestration that spun up YDB and invoked the workload with --dont-push / explicit create/run phases. The v2 action owns that lifecycle via deploy/compose.yml — we just hand it two prebuilt images. - Gate on the `SLO` PR label. - Build both images with a single `docker build` per ref; if the baseline commit can't be built (missing Dockerfile or compile error on a historical SHA), fall back to the current image so the run is comparable against itself rather than silently failing. - Drop `--build-arg REF=…` — ref is now read from WORKLOAD_REF env at runtime. - Rename matrix entry to `cpp-key-value` to match the built binary and collapse the per-compiler matrix to a single clang entry (gcc variant can be added back when needed). slo_report.yml: - Pin to @v2. - Add a second job that removes the `SLO` label from the PR after the report is published, matching js-sdk/java-sdk/go-sdk. --- .github/workflows/slo.yml | 278 ++++++++----------------------- .github/workflows/slo_report.yml | 25 ++- 2 files changed, 95 insertions(+), 208 deletions(-) diff --git a/.github/workflows/slo.yml b/.github/workflows/slo.yml index 0618526617..4cdf4bfec8 100644 --- a/.github/workflows/slo.yml +++ b/.github/workflows/slo.yml @@ -2,275 +2,143 @@ name: SLO on: pull_request: - types: [opened, reopened, synchronize] - branches: - - main - workflow_dispatch: - inputs: - github_issue: - description: "GitHub issue number where the SLO results will be reported" - required: true - baseline_ref: - description: "Baseline commit/branch/tag to compare against (leave empty to auto-detect merge-base with main)" - required: false - slo_workload_duration_seconds: - description: "Duration of the SLO workload in seconds" - required: false - default: "600" - slo_workload_read_max_rps: - description: "Maximum read RPS for the SLO workload" - required: false - default: "1000" - slo_workload_write_max_rps: - description: "Maximum write RPS for the SLO workload" - required: false - default: "100" + types: [opened, reopened, synchronize, labeled] jobs: ydb-slo-action: + if: contains(github.event.pull_request.labels.*.name, 'SLO') + name: Run YDB SLO Tests runs-on: ubuntu-latest + permissions: + contents: read + strategy: + fail-fast: false matrix: - compiler: [clang, gcc] - include: - - workload: table + sdk: + - name: cpp-key-value + preset: release-test-clang + command: "" concurrency: - group: slo-${{ github.ref }}-${{ matrix.os }}-${{ matrix.workload }}-${{ matrix.compiler }} + group: slo-${{ github.ref }}-${{ matrix.sdk.name }} cancel-in-progress: true steps: - name: Install dependencies run: | + set -euxo pipefail YQ_VERSION=v4.48.2 BUILDX_VERSION=0.30.1 COMPOSE_VERSION=2.40.3 - sudo curl -L https://github.com/mikefarah/yq/releases/download/${YQ_VERSION}/yq_linux_amd64 -o /usr/local/bin/yq && \ - sudo chmod +x /usr/local/bin/yq + sudo curl -fLo /usr/local/bin/yq \ + "https://github.com/mikefarah/yq/releases/download/${YQ_VERSION}/yq_linux_amd64" + sudo chmod +x /usr/local/bin/yq - echo "Updating Docker plugins..." sudo mkdir -p /usr/local/lib/docker/cli-plugins - echo "Installing Docker Buildx ${BUILDX_VERSION}..." sudo curl -fLo /usr/local/lib/docker/cli-plugins/docker-buildx \ "https://github.com/docker/buildx/releases/download/v${BUILDX_VERSION}/buildx-v${BUILDX_VERSION}.linux-amd64" sudo chmod +x /usr/local/lib/docker/cli-plugins/docker-buildx - echo "Installing Docker Compose ${COMPOSE_VERSION}..." sudo curl -fLo /usr/local/lib/docker/cli-plugins/docker-compose \ "https://github.com/docker/compose/releases/download/v${COMPOSE_VERSION}/docker-compose-linux-x86_64" sudo chmod +x /usr/local/lib/docker/cli-plugins/docker-compose - echo "Installed versions:" yq --version docker --version docker buildx version docker compose version - - name: Checkout current version + - name: Checkout current SDK version uses: actions/checkout@v5 with: - path: current + path: sdk-current fetch-depth: 0 submodules: true - name: Determine baseline commit id: baseline + working-directory: sdk-current run: | - cd current - if [[ -n "${{ inputs.baseline_ref }}" ]]; then - BASELINE="${{ inputs.baseline_ref }}" - else - BASELINE=$(git merge-base HEAD origin/main) - fi - echo "sha=$BASELINE" >> $GITHUB_OUTPUT + set -euo pipefail + BASELINE=$(git merge-base HEAD origin/main) + echo "sha=${BASELINE}" >> "$GITHUB_OUTPUT" - # Try to determine a human-readable ref name for baseline - # Check if baseline is on main - if git merge-base --is-ancestor $BASELINE origin/main && \ - [ "$(git rev-parse origin/main)" = "$BASELINE" ]; then + if git merge-base --is-ancestor "${BASELINE}" origin/main && \ + [ "$(git rev-parse origin/main)" = "${BASELINE}" ]; then BASELINE_REF="main" else - # Try to find a branch containing this commit - BRANCH=$(git branch -r --contains $BASELINE | grep -v HEAD | head -1 | sed 's/.*\///' || echo "") - if [ -n "$BRANCH" ]; then + BRANCH=$(git branch -r --contains "${BASELINE}" | grep -v HEAD | head -1 | sed 's|.*/||' || echo "") + if [ -n "${BRANCH}" ]; then BASELINE_REF="${BRANCH}@${BASELINE:0:7}" else BASELINE_REF="${BASELINE:0:7}" fi fi - echo "ref=$BASELINE_REF" >> $GITHUB_OUTPUT + echo "ref=${BASELINE_REF}" >> "$GITHUB_OUTPUT" - - name: Checkout baseline version + - name: Checkout baseline SDK version uses: actions/checkout@v5 with: ref: ${{ steps.baseline.outputs.sha }} - path: baseline + path: sdk-baseline fetch-depth: 1 submodules: true - - name: Build Workload Image + - name: Build current workload image + working-directory: sdk-current run: | - echo "Cleaning up Docker system before builds..." - docker system prune -af --volumes - docker builder prune -af - df -h - - # Build current version - if [ -f "$GITHUB_WORKSPACE/current/tests/slo_workloads/Dockerfile" ]; then - echo "Building current app image..." - cd "$GITHUB_WORKSPACE/current" - - # Use SLO-specific .dockerignore - cp tests/slo_workloads/.dockerignore .dockerignore - - docker build -t ydb-app-current \ - --build-arg REF="${{ github.head_ref || github.ref_name }}" \ - --build-arg PRESET=release-test-${{ matrix.compiler }} \ - -f tests/slo_workloads/Dockerfile . - - # Clean up .dockerignore - rm -f .dockerignore - else - echo "No current app Dockerfile found" - exit 1 + set -euxo pipefail + cp tests/slo_workloads/.dockerignore .dockerignore + docker build \ + --platform linux/amd64 \ + --build-arg PRESET=${{ matrix.sdk.preset }} \ + -t ydb-app-current \ + -f tests/slo_workloads/Dockerfile \ + . + rm -f .dockerignore + + - name: Build baseline workload image + working-directory: sdk-baseline + run: | + set -euxo pipefail + # If the historical commit lacks the SLO workload files or can't + # compile, fall back to the current image so the SLO run is still + # comparable against itself rather than silently failing. + if [ ! -f tests/slo_workloads/Dockerfile ]; then + echo "Baseline commit has no SLO Dockerfile; reusing current image" + docker tag ydb-app-current ydb-app-baseline + exit 0 fi - docker system prune -f --volumes - docker builder prune -af - - # Build baseline version - if [ -f "$GITHUB_WORKSPACE/baseline/tests/slo_workloads/Dockerfile" ]; then - echo "Building baseline app image..." - cd "$GITHUB_WORKSPACE/baseline" - - # Use SLO-specific .dockerignore - cp tests/slo_workloads/.dockerignore .dockerignore - - docker build -t ydb-app-baseline \ - --build-arg REF="${{ steps.baseline.outputs.ref }}" \ - --build-arg PRESET=release-test-${{ matrix.compiler }} \ - -f tests/slo_workloads/Dockerfile . - - # Clean up .dockerignore - rm -f .dockerignore - else - echo "No baseline app Dockerfile found" - exit 1 + cp tests/slo_workloads/.dockerignore .dockerignore + if ! docker build \ + --platform linux/amd64 \ + --build-arg PRESET=${{ matrix.sdk.preset }} \ + -t ydb-app-baseline \ + -f tests/slo_workloads/Dockerfile \ + . + then + echo "Baseline build failed; reusing current image" + docker tag ydb-app-current ydb-app-baseline fi + rm -f .dockerignore - docker system prune -f --volumes - docker builder prune -af - - echo "Final disk space after builds:" - df -h - - - name: Initialize YDB SLO - uses: ydb-platform/ydb-slo-action/init@main + - name: Run SLO Tests + uses: ydb-platform/ydb-slo-action/init@v2 + timeout-minutes: 30 with: - github_issue: ${{ github.event.inputs.github_issue }} + github_issue: ${{ github.event.pull_request.number }} github_token: ${{ secrets.GITHUB_TOKEN }} - workload_name: ${{ matrix.workload }}-${{ matrix.compiler }} + workload_name: ${{ matrix.sdk.name }} + workload_duration: "600" workload_current_ref: ${{ github.head_ref || github.ref_name }} + workload_current_image: ydb-app-current + workload_current_command: ${{ matrix.sdk.command }} --read-rps 1000 --write-rps 100 workload_baseline_ref: ${{ steps.baseline.outputs.ref }} - - - name: Prepare SLO Database - run: | - echo "Preparing SLO database..." - docker run --rm --network ydb_ydb-net \ - --add-host "ydb:172.28.0.11" \ - --add-host "ydb:172.28.0.12" \ - --add-host "ydb:172.28.0.13" \ - --add-host "ydb:172.28.0.99" \ - ydb-app-current --connection-string grpc://ydb:2136/?database=/Root/testdb create --dont-push - - - name: Run SLO Tests (parallel) - timeout-minutes: 15 - run: | - DURATION=${{ inputs.slo_workload_duration_seconds || 600 }} - READ_RPS=${{ inputs.slo_workload_read_max_rps || 1000 }} - WRITE_RPS=${{ inputs.slo_workload_write_max_rps || 100 }} - - ARGS="--connection-string grpc://ydb:2136/?database=/Root/testdb run \ - --metrics-push-url http://prometheus:9090/api/v1/otlp/v1/metrics \ - --time $DURATION \ - --read-rps $READ_RPS \ - --write-rps $WRITE_RPS \ - --read-timeout 100 \ - --write-timeout 100" - - echo "Starting ydb-app-current..." - docker run -d \ - --name ydb-app-current \ - --network ydb_ydb-net \ - --add-host "ydb:172.28.0.11" \ - --add-host "ydb:172.28.0.12" \ - --add-host "ydb:172.28.0.13" \ - --add-host "ydb:172.28.0.99" \ - ydb-app-current $ARGS - - echo "Starting ydb-app-baseline..." - docker run -d \ - --name ydb-app-baseline \ - --network ydb_ydb-net \ - --add-host "ydb:172.28.0.11" \ - --add-host "ydb:172.28.0.12" \ - --add-host "ydb:172.28.0.13" \ - --add-host "ydb:172.28.0.99" \ - ydb-app-baseline $ARGS - - # Show initial logs - echo "" - echo "==================== INITIAL CURRENT LOGS ====================" - docker logs -n 15 ydb-app-current 2>&1 || echo "No current container" - echo "" - echo "==================== INITIAL BASELINE LOGS ====================" - docker logs -n 15 ydb-app-baseline 2>&1 || echo "No baseline container" - echo "" - - # Wait for workloads to complete - echo "Waiting for workloads to complete (${DURATION}s)..." - sleep ${DURATION} - - # Stop containers after workload duration and wait for graceful shutdown - echo "Stopping containers after ${DURATION}s..." - docker stop --timeout=30 ydb-app-current ydb-app-baseline 2>&1 || true - - # Force kill if still running - docker kill ydb-app-current ydb-app-baseline 2>&1 || true - - # Check exit codes - CURRENT_EXIT=$(docker inspect ydb-app-current --format='{{.State.ExitCode}}' 2>/dev/null || echo "1") - BASELINE_EXIT=$(docker inspect ydb-app-baseline --format='{{.State.ExitCode}}' 2>/dev/null || echo "0") - - echo "Current container exit code: $CURRENT_EXIT" - echo "Baseline container exit code: $BASELINE_EXIT" - - # Show final logs - echo "" - echo "==================== FINAL CURRENT LOGS ====================" - docker logs -n 15 ydb-app-current 2>&1 || echo "No current container" - echo "" - echo "==================== FINAL BASELINE LOGS ====================" - docker logs -n 15 ydb-app-baseline 2>&1 || echo "No baseline container" - echo "" - - echo "SUCCESS: Workloads completed successfully" - - - if: always() - name: Store logs - run: | - docker logs ydb-app-current > current.log 2>&1 || echo "No current container" - docker logs ydb-app-baseline > baseline.log 2>&1 || echo "No baseline container" - - - if: always() - uses: actions/upload-artifact@v4 - with: - name: ${{ matrix.workload }}-${{ matrix.compiler }}-slo-cpp-sdk-logs - path: | - ./current.log - ./baseline.log - retention-days: 1 + workload_baseline_image: ydb-app-baseline + workload_baseline_command: ${{ matrix.sdk.command }} --read-rps 1000 --write-rps 100 diff --git a/.github/workflows/slo_report.yml b/.github/workflows/slo_report.yml index 0a7c2e3483..b2bbee5172 100644 --- a/.github/workflows/slo_report.yml +++ b/.github/workflows/slo_report.yml @@ -7,17 +7,36 @@ on: - completed jobs: - ydb-slo-action-report: + publish-slo-report: + if: github.event.workflow_run.conclusion == 'success' runs-on: ubuntu-latest name: Publish YDB SLO Report permissions: checks: write contents: read pull-requests: write - if: github.event.workflow_run.conclusion == 'success' steps: - name: Publish YDB SLO Report - uses: ydb-platform/ydb-slo-action/report@main + uses: ydb-platform/ydb-slo-action/report@v2 with: github_token: ${{ secrets.GITHUB_TOKEN }} github_run_id: ${{ github.event.workflow_run.id }} + + remove-slo-label: + if: github.event.workflow_run.event == 'pull_request' + runs-on: ubuntu-latest + name: Remove SLO Label + permissions: + pull-requests: write + steps: + - name: Remove SLO label from PR + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + PRS: ${{ toJSON(github.event.workflow_run.pull_requests) }} + REPO: ${{ github.event.workflow_run.repository.full_name }} + run: | + set -euo pipefail + PR=$(jq -r '.[0].number' <<<"$PRS") + if [ "$PR" != "null" ] && [ -n "$PR" ]; then + gh pr edit "$PR" --repo "$REPO" --remove-label SLO + fi From 2d9a928d10ab416c7c2b92c1bbe1e7759367db31 Mon Sep 17 00:00:00 2001 From: Vladislav Polyakov Date: Tue, 5 May 2026 17:11:59 +0300 Subject: [PATCH 03/11] tests/slo_workloads: address Copilot review feedback MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Counters: switch OperationsTotal/RetryAttemptsTotal from DoubleCounter to UInt64Counter. Operation counts and retry attempts are inherently integer; float representation could drift for very high cumulative values and doesn't match the Prometheus counter convention. - Percentile gauges: publish 0.0 on empty-window intervals instead of returning early. OTel's sync Gauge holds the last Record() value and the periodic exporter re-emits it every collection cycle — without an explicit reset, gauges would look "stuck" at the last non-empty value after load stops, contradicting the per-second HDR reset semantics. --- tests/slo_workloads/utils/metrics.cpp | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/tests/slo_workloads/utils/metrics.cpp b/tests/slo_workloads/utils/metrics.cpp index 996a48780f..4506c1ca39 100644 --- a/tests/slo_workloads/utils/metrics.cpp +++ b/tests/slo_workloads/utils/metrics.cpp @@ -138,7 +138,7 @@ class TOtelMetricsPusher : public IMetricsPusher { const bool success = requestData.Status == NYdb::EStatus::SUCCESS; const std::string status = success ? "success" : "error"; - OperationsTotal_->Add(1, MergeAttributes({ + OperationsTotal_->Add(uint64_t{1}, MergeAttributes({ {"operation_type", OperationType_}, {"operation_status", status}, })); @@ -146,7 +146,7 @@ class TOtelMetricsPusher : public IMetricsPusher { // sdk_retry_attempts_total = total number of technical attempts // including the first one. TStatUnit counts only post-first attempts, // so add 1 to include the initial attempt. - RetryAttemptsTotal_->Add(static_cast(requestData.RetryAttempts + 1), + RetryAttemptsTotal_->Add(requestData.RetryAttempts + 1, MergeAttributes({ {"operation_type", OperationType_}, }) @@ -159,11 +159,11 @@ class TOtelMetricsPusher : public IMetricsPusher { private: void InitMetrics() { - OperationsTotal_ = Meter_->CreateDoubleCounter("sdk_operations_total", + OperationsTotal_ = Meter_->CreateUInt64Counter("sdk_operations_total", "Total number of operations, categorized by operation type and status." ); - RetryAttemptsTotal_ = Meter_->CreateDoubleCounter("sdk_retry_attempts_total", + RetryAttemptsTotal_ = Meter_->CreateUInt64Counter("sdk_retry_attempts_total", "Total number of retry attempts (including the first attempt), categorized by operation type." ); @@ -191,16 +191,18 @@ class TOtelMetricsPusher : public IMetricsPusher { void PublishPercentiles() { auto snapshot = Latency_.SnapshotAndReset(); - if (!snapshot.HasData) { - return; - } auto attrs = MergeAttributes({ {"operation_type", OperationType_}, {"operation_status", "success"}, }); - LatencyP50_->Record(snapshot.P50, attrs); - LatencyP95_->Record(snapshot.P95, attrs); - LatencyP99_->Record(snapshot.P99, attrs); + // When no successful ops landed in the last second, publish 0.0 + // for all percentiles so the gauges reset with the HDR window + // rather than appearing "stuck" at the last non-empty value (the + // OTel periodic exporter would otherwise re-emit the previous + // Record() value on every collection cycle). + LatencyP50_->Record(snapshot.HasData ? snapshot.P50 : 0.0, attrs); + LatencyP95_->Record(snapshot.HasData ? snapshot.P95 : 0.0, attrs); + LatencyP99_->Record(snapshot.HasData ? snapshot.P99 : 0.0, attrs); } std::map MergeAttributes(const std::map& metricAttrs) const { @@ -216,8 +218,8 @@ class TOtelMetricsPusher : public IMetricsPusher { std::unique_ptr MeterProvider_; std::shared_ptr Meter_; - std::unique_ptr> OperationsTotal_; - std::unique_ptr> RetryAttemptsTotal_; + std::unique_ptr> OperationsTotal_; + std::unique_ptr> RetryAttemptsTotal_; std::unique_ptr> LatencyP50_; std::unique_ptr> LatencyP95_; std::unique_ptr> LatencyP99_; From cbe3597d10ed9ac7f2b7af9229dd4ed7d1602f3f Mon Sep 17 00:00:00 2001 From: Vladislav Polyakov Date: Tue, 5 May 2026 17:27:27 +0300 Subject: [PATCH 04/11] tests/slo_workloads: make apt tolerant of PPA timeouts Shared CI runners (and local docker builds) periodically hit connection timeouts on ppa.launchpadcontent.net. Telling apt itself to retry via Acquire::Retries=5 plus a 60 s connect timeout handles the blip in-place without a shell retry loop. --- tests/slo_workloads/Dockerfile | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tests/slo_workloads/Dockerfile b/tests/slo_workloads/Dockerfile index 65d9c7693b..ca9d77d5fe 100644 --- a/tests/slo_workloads/Dockerfile +++ b/tests/slo_workloads/Dockerfile @@ -2,6 +2,12 @@ FROM ubuntu:22.04 ARG PRESET=release-test-clang +# Make apt tolerant of transient PPA/mirror timeouts (shared runners see +# ppa.launchpadcontent.net connection timeouts every few builds). +RUN echo 'Acquire::Retries "5";' > /etc/apt/apt.conf.d/80-retries && \ + echo 'Acquire::http::Timeout "60";' >> /etc/apt/apt.conf.d/80-retries && \ + echo 'Acquire::https::Timeout "60";' >> /etc/apt/apt.conf.d/80-retries + # Install software-properties-common for add-apt-repository RUN apt-get -y update && apt-get -y install software-properties-common && add-apt-repository ppa:ubuntu-toolchain-r/test From 1799ac35f1c15a1ff84368c04beee6022a328a1c Mon Sep 17 00:00:00 2001 From: Vladislav Polyakov Date: Tue, 5 May 2026 18:06:21 +0300 Subject: [PATCH 05/11] ci(slo): make 30-min build resilient to network flakes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two separate issues were making every SLO PR a coin flip: 1. Any transient network error (GitHub release download, PPA timeout, abseil/protobuf/grpc tarball) kills the entire 30-min build. Each of the 8 wget calls and both apt-get steps is now retried: - wget gets --tries=5 --waitretry=15 --timeout=60 --retry-connrefused --retry-on-http-error=500,502,503,504 via a shared WGET_OPTS env var. - apt already has Acquire::Retries=5. 2. Every CI run does a full cold build. The Dockerfile's toolchain and dep layers (~25 min) never change between SDK commits, so caching them is trivially safe. - Switch slo.yml to docker/build-push-action@v6 with GHA type=gha cache export/import, scoped per preset. - First build on a runner still takes ~30 min; subsequent builds where only the SDK source changed should take ~3 min. - Baseline build uses continue-on-error + an explicit fallback step that retags ydb-app-current as ydb-app-baseline when the historical commit won't compile — replaces the inline shell if-else that docker/build-push-action can't express. --- .github/workflows/slo.yml | 78 ++++++++++++++++++++-------------- tests/slo_workloads/Dockerfile | 25 ++++++----- 2 files changed, 59 insertions(+), 44 deletions(-) diff --git a/.github/workflows/slo.yml b/.github/workflows/slo.yml index 4cdf4bfec8..0982b7125c 100644 --- a/.github/workflows/slo.yml +++ b/.github/workflows/slo.yml @@ -89,44 +89,56 @@ jobs: fetch-depth: 1 submodules: true - - name: Build current workload image - working-directory: sdk-current - run: | - set -euxo pipefail - cp tests/slo_workloads/.dockerignore .dockerignore - docker build \ - --platform linux/amd64 \ - --build-arg PRESET=${{ matrix.sdk.preset }} \ - -t ydb-app-current \ - -f tests/slo_workloads/Dockerfile \ - . - rm -f .dockerignore + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 - - name: Build baseline workload image - working-directory: sdk-baseline + # .dockerignore lives under tests/slo_workloads/; buildx expects it at + # the context root, so place a copy in each SDK checkout. + - name: Stage .dockerignore run: | set -euxo pipefail - # If the historical commit lacks the SLO workload files or can't - # compile, fall back to the current image so the SLO run is still - # comparable against itself rather than silently failing. - if [ ! -f tests/slo_workloads/Dockerfile ]; then - echo "Baseline commit has no SLO Dockerfile; reusing current image" - docker tag ydb-app-current ydb-app-baseline - exit 0 + cp sdk-current/tests/slo_workloads/.dockerignore sdk-current/.dockerignore + if [ -f sdk-baseline/tests/slo_workloads/.dockerignore ]; then + cp sdk-baseline/tests/slo_workloads/.dockerignore sdk-baseline/.dockerignore fi - cp tests/slo_workloads/.dockerignore .dockerignore - if ! docker build \ - --platform linux/amd64 \ - --build-arg PRESET=${{ matrix.sdk.preset }} \ - -t ydb-app-baseline \ - -f tests/slo_workloads/Dockerfile \ - . - then - echo "Baseline build failed; reusing current image" - docker tag ydb-app-current ydb-app-baseline - fi - rm -f .dockerignore + # A clean build of the SLO image takes ~30 min because the Dockerfile + # rebuilds the full C++ toolchain + abseil/protobuf/grpc from source. + # The GHA cache lets subsequent runs reuse every layer up to the SDK + # source COPY, so only the actual workload link step reruns (~3 min). + - name: Build current workload image + uses: docker/build-push-action@v6 + with: + context: sdk-current + file: sdk-current/tests/slo_workloads/Dockerfile + platforms: linux/amd64 + tags: ydb-app-current + load: true + build-args: PRESET=${{ matrix.sdk.preset }} + cache-from: type=gha,scope=slo-${{ matrix.sdk.preset }} + cache-to: type=gha,mode=max,scope=slo-${{ matrix.sdk.preset }} + + - name: Build baseline workload image + id: baseline-build + continue-on-error: true + uses: docker/build-push-action@v6 + with: + context: sdk-baseline + file: sdk-baseline/tests/slo_workloads/Dockerfile + platforms: linux/amd64 + tags: ydb-app-baseline + load: true + build-args: PRESET=${{ matrix.sdk.preset }} + cache-from: type=gha,scope=slo-${{ matrix.sdk.preset }} + + # If the historical commit lacks the SLO Dockerfile or can't compile, + # reuse the current image so the SLO run is still comparable against + # itself rather than failing outright. + - name: Fall back to current image for baseline + if: steps.baseline-build.outcome == 'failure' + run: | + echo "Baseline build failed; reusing current image as baseline." + docker tag ydb-app-current ydb-app-baseline - name: Run SLO Tests uses: ydb-platform/ydb-slo-action/init@v2 diff --git a/tests/slo_workloads/Dockerfile b/tests/slo_workloads/Dockerfile index ca9d77d5fe..7f8ea5e78e 100644 --- a/tests/slo_workloads/Dockerfile +++ b/tests/slo_workloads/Dockerfile @@ -2,12 +2,15 @@ FROM ubuntu:22.04 ARG PRESET=release-test-clang -# Make apt tolerant of transient PPA/mirror timeouts (shared runners see -# ppa.launchpadcontent.net connection timeouts every few builds). +# Every RUN that hits the network retries on transient failures so one +# flake doesn't throw away 30 min of previous build work. apt gets five +# Acquire retries + 60 s timeouts; wget gets the equivalent via WGET_OPTS. RUN echo 'Acquire::Retries "5";' > /etc/apt/apt.conf.d/80-retries && \ echo 'Acquire::http::Timeout "60";' >> /etc/apt/apt.conf.d/80-retries && \ echo 'Acquire::https::Timeout "60";' >> /etc/apt/apt.conf.d/80-retries +ENV WGET_OPTS="--tries=5 --waitretry=15 --timeout=60 --retry-connrefused --retry-on-http-error=500,502,503,504" + # Install software-properties-common for add-apt-repository RUN apt-get -y update && apt-get -y install software-properties-common && add-apt-repository ppa:ubuntu-toolchain-r/test @@ -20,7 +23,7 @@ RUN apt-get -y update && apt-get -y install \ # Install CMake ENV CMAKE_VERSION=3.27.7 -RUN wget https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-linux-x86_64.sh \ +RUN wget $WGET_OPTS https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-linux-x86_64.sh \ -q -O cmake-install.sh \ && chmod u+x cmake-install.sh \ && ./cmake-install.sh --skip-license --prefix=/usr/local \ @@ -28,7 +31,7 @@ RUN wget https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cm # Install LLVM ENV LLVM_VERSION=16 -RUN wget https://apt.llvm.org/llvm.sh && \ +RUN wget $WGET_OPTS https://apt.llvm.org/llvm.sh && \ chmod u+x llvm.sh && \ ./llvm.sh ${LLVM_VERSION} && \ rm llvm.sh @@ -45,7 +48,7 @@ RUN update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-13 10000 && \ # Install abseil-cpp ENV ABSEIL_CPP_VERSION=20230802.0 ENV ABSEIL_CPP_INSTALL_DIR=/root/ydb_deps/absl -RUN wget -O abseil-cpp-${ABSEIL_CPP_VERSION}.tar.gz https://github.com/abseil/abseil-cpp/archive/refs/tags/${ABSEIL_CPP_VERSION}.tar.gz && \ +RUN wget $WGET_OPTS -O abseil-cpp-${ABSEIL_CPP_VERSION}.tar.gz https://github.com/abseil/abseil-cpp/archive/refs/tags/${ABSEIL_CPP_VERSION}.tar.gz && \ tar -xvzf abseil-cpp-${ABSEIL_CPP_VERSION}.tar.gz && cd abseil-cpp-${ABSEIL_CPP_VERSION} && \ mkdir build && cd build && \ cmake -G Ninja -DCMAKE_BUILD_TYPE=Release -DABSL_PROPAGATE_CXX_STD=ON .. && \ @@ -56,7 +59,7 @@ RUN wget -O abseil-cpp-${ABSEIL_CPP_VERSION}.tar.gz https://github.com/abseil/ab # Install protobuf ENV PROTOBUF_VERSION=3.21.12 ENV PROTOBUF_INSTALL_DIR=/root/ydb_deps/protobuf -RUN wget -O protobuf-${PROTOBUF_VERSION}.tar.gz https://github.com/protocolbuffers/protobuf/archive/refs/tags/v${PROTOBUF_VERSION}.tar.gz && \ +RUN wget $WGET_OPTS -O protobuf-${PROTOBUF_VERSION}.tar.gz https://github.com/protocolbuffers/protobuf/archive/refs/tags/v${PROTOBUF_VERSION}.tar.gz && \ tar -xvzf protobuf-${PROTOBUF_VERSION}.tar.gz && cd protobuf-${PROTOBUF_VERSION} && \ mkdir build && cd build && \ cmake -G Ninja -DCMAKE_BUILD_TYPE=Release -Dprotobuf_BUILD_TESTS=OFF -Dprotobuf_INSTALL=ON -Dprotobuf_ABSL_PROVIDER=package .. && \ @@ -67,7 +70,7 @@ RUN wget -O protobuf-${PROTOBUF_VERSION}.tar.gz https://github.com/protocolbuffe # Install grpc ENV GRPC_VERSION=1.54.3 ENV GRPC_INSTALL_DIR=/root/ydb_deps/grpc -RUN wget -O grpc-${GRPC_VERSION}.tar.gz https://github.com/grpc/grpc/archive/refs/tags/v${GRPC_VERSION}.tar.gz && \ +RUN wget $WGET_OPTS -O grpc-${GRPC_VERSION}.tar.gz https://github.com/grpc/grpc/archive/refs/tags/v${GRPC_VERSION}.tar.gz && \ tar -xvzf grpc-${GRPC_VERSION}.tar.gz && cd grpc-${GRPC_VERSION} && \ mkdir build && cd build && \ cmake -G Ninja -DCMAKE_PREFIX_PATH="${ABSEIL_CPP_INSTALL_DIR};${PROTOBUF_INSTALL_DIR}" \ @@ -84,7 +87,7 @@ RUN wget -O grpc-${GRPC_VERSION}.tar.gz https://github.com/grpc/grpc/archive/ref # Install base64 ENV BASE64_VERSION=0.5.2 ENV BASE64_INSTALL_DIR=/root/ydb_deps/base64 -RUN wget -O base64-${BASE64_VERSION}.tar.gz https://github.com/aklomp/base64/archive/refs/tags/v${BASE64_VERSION}.tar.gz && \ +RUN wget $WGET_OPTS -O base64-${BASE64_VERSION}.tar.gz https://github.com/aklomp/base64/archive/refs/tags/v${BASE64_VERSION}.tar.gz && \ tar -xvzf base64-${BASE64_VERSION}.tar.gz && cd base64-${BASE64_VERSION} && \ mkdir build && cd build && \ cmake -G Ninja -DCMAKE_BUILD_TYPE=Release .. && \ @@ -95,7 +98,7 @@ RUN wget -O base64-${BASE64_VERSION}.tar.gz https://github.com/aklomp/base64/arc # Install brotli ENV BROTLI_VERSION=1.1.0 ENV BROTLI_INSTALL_DIR=/root/ydb_deps/brotli -RUN wget -O brotli-${BROTLI_VERSION}.tar.gz https://github.com/google/brotli/archive/refs/tags/v${BROTLI_VERSION}.tar.gz && \ +RUN wget $WGET_OPTS -O brotli-${BROTLI_VERSION}.tar.gz https://github.com/google/brotli/archive/refs/tags/v${BROTLI_VERSION}.tar.gz && \ tar -xvzf brotli-${BROTLI_VERSION}.tar.gz && cd brotli-${BROTLI_VERSION} && \ mkdir build && cd build && \ cmake -G Ninja -DCMAKE_BUILD_TYPE=Release .. && \ @@ -106,7 +109,7 @@ RUN wget -O brotli-${BROTLI_VERSION}.tar.gz https://github.com/google/brotli/arc # Install jwt-cpp ENV JWT_CPP_VERSION=0.7.0 ENV JWT_CPP_INSTALL_DIR=/root/ydb_deps/jwt-cpp -RUN wget -O jwt-cpp-${JWT_CPP_VERSION}.tar.gz https://github.com/Thalhammer/jwt-cpp/archive/refs/tags/v${JWT_CPP_VERSION}.tar.gz && \ +RUN wget $WGET_OPTS -O jwt-cpp-${JWT_CPP_VERSION}.tar.gz https://github.com/Thalhammer/jwt-cpp/archive/refs/tags/v${JWT_CPP_VERSION}.tar.gz && \ tar -xvzf jwt-cpp-${JWT_CPP_VERSION}.tar.gz && cd jwt-cpp-${JWT_CPP_VERSION} && \ mkdir build && cd build && \ cmake -G Ninja -DCMAKE_BUILD_TYPE=Release .. && \ @@ -116,7 +119,7 @@ RUN wget -O jwt-cpp-${JWT_CPP_VERSION}.tar.gz https://github.com/Thalhammer/jwt- # Install ccache 4.8.1 or above ENV CCACHE_VERSION=4.8.1 -RUN wget https://github.com/ccache/ccache/releases/download/v${CCACHE_VERSION}/ccache-${CCACHE_VERSION}-linux-x86_64.tar.xz \ +RUN wget $WGET_OPTS https://github.com/ccache/ccache/releases/download/v${CCACHE_VERSION}/ccache-${CCACHE_VERSION}-linux-x86_64.tar.xz \ && tar -xf ccache-${CCACHE_VERSION}-linux-x86_64.tar.xz \ && cp ccache-${CCACHE_VERSION}-linux-x86_64/ccache /usr/local/bin/ \ && rm -rf ccache-${CCACHE_VERSION}-linux-x86_64 ccache-${CCACHE_VERSION}-linux-x86_64.tar.xz From da782d4c559470ede50c9857bbdb045d9bf4bb4f Mon Sep 17 00:00:00 2001 From: Vladislav Polyakov Date: Tue, 5 May 2026 18:07:02 +0300 Subject: [PATCH 06/11] tests/slo_workloads: retry PPA/apt steps at the shell level The earlier Acquire::Retries=5 tweak covered HTTP-level errors but didn't handle TCP connect timeouts to ppa.launchpadcontent.net, which is the actual failure mode observed on shared CI runners. Wrap both the add-apt-repository step and the main apt-get install in shell retry loops (5 attempts, 15/30/45/60/75 s backoff) so a CDN blip no longer kills the 30-min build. --- tests/slo_workloads/Dockerfile | 33 ++++++++++++++++++++++++++------- 1 file changed, 26 insertions(+), 7 deletions(-) diff --git a/tests/slo_workloads/Dockerfile b/tests/slo_workloads/Dockerfile index 7f8ea5e78e..7680560cac 100644 --- a/tests/slo_workloads/Dockerfile +++ b/tests/slo_workloads/Dockerfile @@ -11,15 +11,34 @@ RUN echo 'Acquire::Retries "5";' > /etc/apt/apt.conf.d/80-retries && \ ENV WGET_OPTS="--tries=5 --waitretry=15 --timeout=60 --retry-connrefused --retry-on-http-error=500,502,503,504" -# Install software-properties-common for add-apt-repository -RUN apt-get -y update && apt-get -y install software-properties-common && add-apt-repository ppa:ubuntu-toolchain-r/test +# Install software-properties-common and add the gcc-13 PPA. +# Acquire::Retries only retries HTTP errors; TCP connect timeouts to +# ppa.launchpadcontent.net still drop through and kill the step. Wrap the +# whole command in a shell retry loop with exponential backoff so a CDN +# blip doesn't throw away 30 minutes of downstream build work. +RUN for i in 1 2 3 4 5; do \ + apt-get -y update && \ + apt-get -y install software-properties-common && \ + add-apt-repository -y ppa:ubuntu-toolchain-r/test && \ + apt-get -y update && \ + break; \ + echo "add-apt-repository attempt $i failed; sleeping $((i * 15))s"; \ + sleep $((i * 15)); \ + done && \ + apt-cache show gcc-13 > /dev/null # fail fast if PPA never came up # Install C++ tools and libraries -RUN apt-get -y update && apt-get -y install \ - git gdb wget ninja-build libidn11-dev ragel yasm libc-ares-dev libre2-dev \ - rapidjson-dev zlib1g-dev libxxhash-dev libzstd-dev libsnappy-dev libgtest-dev libgmock-dev \ - libbz2-dev liblz4-dev libdouble-conversion-dev libssl-dev libstdc++-13-dev gcc-13 g++-13 \ - && apt-get clean && rm -rf /var/lib/apt/lists/* +RUN for i in 1 2 3 4 5; do \ + apt-get -y install \ + git gdb wget ninja-build libidn11-dev ragel yasm libc-ares-dev libre2-dev \ + rapidjson-dev zlib1g-dev libxxhash-dev libzstd-dev libsnappy-dev libgtest-dev libgmock-dev \ + libbz2-dev liblz4-dev libdouble-conversion-dev libssl-dev libstdc++-13-dev gcc-13 g++-13 && \ + break; \ + echo "apt-get install attempt $i failed; sleeping $((i * 15))s"; \ + sleep $((i * 15)); \ + apt-get -y update || true; \ + done && \ + apt-get clean && rm -rf /var/lib/apt/lists/* # Install CMake ENV CMAKE_VERSION=3.27.7 From a8da8bb6de1fcec95e257bd7fc54f8e790d7deeb Mon Sep 17 00:00:00 2001 From: Vladislav Polyakov Date: Tue, 5 May 2026 20:07:39 +0300 Subject: [PATCH 07/11] tests/slo_workloads: forward run-phase args in implicit All mode The v2 SLO action invokes the workload as `slo-key-value ` without a subcommand keyword. The global parser used to error on unknown long options like --read-rps; now it tolerates them and the implicit All branch passes the leftover argv to the run phase. --- tests/slo_workloads/utils/utils.cpp | 33 +++++++++++++++++++++++------ 1 file changed, 26 insertions(+), 7 deletions(-) diff --git a/tests/slo_workloads/utils/utils.cpp b/tests/slo_workloads/utils/utils.cpp index 78231c8e1c..89a642b61d 100644 --- a/tests/slo_workloads/utils/utils.cpp +++ b/tests/slo_workloads/utils/utils.cpp @@ -161,6 +161,12 @@ int DoMain(int argc, char** argv, TCreateCommand create, TRunCommand run, TClean opts.SetFreeArgsMin(0); opts.SetFreeArgTitle(0, "", GetCmdList()); opts.ArgPermutation_ = NLastGetopt::REQUIRE_ORDER; + // Run-phase options (--read-rps, --write-rps, …) reach DoMain when the + // caller invokes the workload without an explicit subcommand (the v2 SLO + // action contract). Tolerate them here so the global parser stops at the + // first unknown option instead of erroring; they are forwarded to the + // run phase below. + opts.AllowUnknownLongOptions_ = true; TOptsParseResult res(&opts, argc, argv); size_t freeArgsPos = res.GetFreeArgsPos(); @@ -169,8 +175,14 @@ int DoMain(int argc, char** argv, TCreateCommand create, TRunCommand run, TClean ECommandType command = (argc > 0) ? ParseCommand(*argv) : ECommandType::All; if (command == ECommandType::Unknown) { - Cerr << "Unknown command '" << *argv << "'" << Endl; - return EXIT_FAILURE; + if (argv[0][0] == '-') { + // First leftover token is an option, not a subcommand keyword: + // treat as implicit All mode and let the run phase parse it. + command = ECommandType::All; + } else { + Cerr << "Unknown command '" << *argv << "'" << Endl; + return EXIT_FAILURE; + } } if (prefix.empty()) { @@ -227,18 +239,25 @@ int DoMain(int argc, char** argv, TCreateCommand create, TRunCommand run, TClean break; case ECommandType::All: { Cout << "Launching full lifecycle: create -> run -> cleanup" << Endl; - // Synthesize argv with a fake program name so the inner NLastGetopt - // parsers (ParseOptionsCreate / ParseOptionsRun) treat argv[0] - // as the program name and parse zero real args. + // Forward leftover argv to the run phase so options like + // --read-rps / --write-rps take effect. argv[0] here is the first + // run-phase option (no subcommand keyword was supplied), so + // prepend a synthetic program name for ParseOptionsRun. char programName[] = "slo"; - char* fakeArgv[] = { programName, nullptr }; + std::vector runArgv; + runArgv.reserve(argc + 1); + runArgv.push_back(programName); + for (int i = 0; i < argc; ++i) { + runArgv.push_back(argv[i]); + } int fakeArgc = 1; + char* fakeArgv[] = { programName, nullptr }; Cout << "[all] Launching create command..." << Endl; result = create(dbOptions, fakeArgc, fakeArgv); if (!result) { Cout << "[all] Launching run command..." << Endl; - result = run(dbOptions, fakeArgc, fakeArgv); + result = run(dbOptions, static_cast(runArgv.size()), runArgv.data()); } Cout << "[all] Launching cleanup command..." << Endl; int cleanupRc = cleanup(dbOptions, fakeArgc); From 4d27c9f612284ccd7a5e611c70b069ea2980b29f Mon Sep 17 00:00:00 2001 From: Vladislav Polyakov Date: Wed, 6 May 2026 00:20:53 +0300 Subject: [PATCH 08/11] ci(slo): overlay current workload harness onto baseline checkout MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The baseline image used to be built from the merge-base checkout's tests/slo_workloads/, so any harness change on the PR (Dockerfile, CLI parser, …) was absent from the baseline. SDK comparison should only vary the library, not the harness — so reuse current's harness on both sides. --- .github/workflows/slo.yml | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/.github/workflows/slo.yml b/.github/workflows/slo.yml index 0982b7125c..63145bba8f 100644 --- a/.github/workflows/slo.yml +++ b/.github/workflows/slo.yml @@ -92,15 +92,19 @@ jobs: - name: Set up Docker Buildx uses: docker/setup-buildx-action@v3 - # .dockerignore lives under tests/slo_workloads/; buildx expects it at - # the context root, so place a copy in each SDK checkout. - - name: Stage .dockerignore + # Use current's workload harness (Dockerfile, sources, .dockerignore) for + # both builds so only the SDK library differs between current and + # baseline. Without this the baseline image picks up the harness from + # the merge-base commit, which can lag behind the action's contract. + # buildx also expects .dockerignore at the context root, not under + # tests/, so copy it up in each checkout. + - name: Stage workload harness run: | set -euxo pipefail + rm -rf sdk-baseline/tests/slo_workloads + cp -a sdk-current/tests/slo_workloads sdk-baseline/tests/slo_workloads cp sdk-current/tests/slo_workloads/.dockerignore sdk-current/.dockerignore - if [ -f sdk-baseline/tests/slo_workloads/.dockerignore ]; then - cp sdk-baseline/tests/slo_workloads/.dockerignore sdk-baseline/.dockerignore - fi + cp sdk-baseline/tests/slo_workloads/.dockerignore sdk-baseline/.dockerignore # A clean build of the SLO image takes ~30 min because the Dockerfile # rebuilds the full C++ toolchain + abseil/protobuf/grpc from source. From a6caf4ef2a74dc198b9c36051445037f89dcefbe Mon Sep 17 00:00:00 2001 From: Vladislav Polyakov Date: Wed, 6 May 2026 00:27:27 +0300 Subject: [PATCH 09/11] ci(slo): wire ccache into the cmake build via BuildKit cache mount MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit cmake configure/build now run under ccache as the C/C++ compiler launcher, with /root/.ccache exposed as a BuildKit cache mount so state persists across runs through cache-to=type=gha,mode=max (BuildKit ≥0.13 exports cache mounts under mode=max). Cold runs incur the usual full compile; warm runs reuse object hashes and should drop cmake --build from ~14 min to a few minutes. --- tests/slo_workloads/Dockerfile | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/tests/slo_workloads/Dockerfile b/tests/slo_workloads/Dockerfile index 7680560cac..f87be8f8df 100644 --- a/tests/slo_workloads/Dockerfile +++ b/tests/slo_workloads/Dockerfile @@ -1,7 +1,16 @@ +# syntax=docker/dockerfile:1.7 FROM ubuntu:22.04 ARG PRESET=release-test-clang +# ccache settings consumed by the configure/build steps below. The cache dir +# is materialised by the BuildKit cache mount on those RUN steps; values +# elsewhere in the image are inert. +ENV CCACHE_DIR=/root/.ccache +ENV CCACHE_MAXSIZE=2G +ENV CCACHE_COMPRESS=true +ENV CCACHE_COMPILERCHECK=content + # Every RUN that hits the network retries on transient failures so one # flake doesn't throw away 30 min of previous build work. apt gets five # Acquire retries + 60 s timeouts; wget gets the equivalent via WGET_OPTS. @@ -147,7 +156,13 @@ COPY . /ydb-cpp-sdk WORKDIR /ydb-cpp-sdk RUN rm -rf build -RUN cmake --preset ${PRESET} -RUN cmake --build --preset default --target slo-key-value +RUN --mount=type=cache,target=/root/.ccache,sharing=locked \ + cmake --preset ${PRESET} \ + -DCMAKE_C_COMPILER_LAUNCHER=ccache \ + -DCMAKE_CXX_COMPILER_LAUNCHER=ccache +RUN --mount=type=cache,target=/root/.ccache,sharing=locked \ + ccache --zero-stats >/dev/null \ + && cmake --build --preset default --target slo-key-value \ + && ccache --show-stats ENTRYPOINT ["./build/tests/slo_workloads/key_value/slo-key-value"] From 09e662f97d5c2dd8c8a0b5b9e4c78fe745c2141e Mon Sep 17 00:00:00 2001 From: Vladislav Polyakov Date: Wed, 6 May 2026 03:17:03 +0300 Subject: [PATCH 10/11] tests/slo_workloads: read prefix from YDB_DATABASE when connection-string lacks ?database= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The v2 SLO action provides YDB_CONNECTION_STRING in path form (grpc://host:port/Root/testdb), which GetDatabase can't parse, so prefix stayed empty and create issued CreateTable("key_value") without the database root → BAD_REQUEST. Falling back to the YDB_DATABASE env var (which the action also sets) restores the correct table path. --- tests/slo_workloads/utils/utils.cpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tests/slo_workloads/utils/utils.cpp b/tests/slo_workloads/utils/utils.cpp index 89a642b61d..f52461283b 100644 --- a/tests/slo_workloads/utils/utils.cpp +++ b/tests/slo_workloads/utils/utils.cpp @@ -188,6 +188,12 @@ int DoMain(int argc, char** argv, TCreateCommand create, TRunCommand run, TClean if (prefix.empty()) { prefix = GetDatabase(connectionString); } + if (prefix.empty()) { + // YDB SLO action sets YDB_CONNECTION_STRING in path form + // (grpc://host:port/Root/testdb), which GetDatabase can't parse. + // Fall back to YDB_DATABASE which the action sets alongside it. + prefix = GetEnv("YDB_DATABASE"); + } if (!ParseToken(token, tokenFile)) { return EXIT_FAILURE; From a581162623f4982bcf4e81a7321328d5146431f6 Mon Sep 17 00:00:00 2001 From: Vladislav Polyakov Date: Wed, 6 May 2026 03:23:57 +0300 Subject: [PATCH 11/11] ci(slo): persist ccache between runs via actions/cache + buildkit-cache-dance cache-to: type=gha,mode=max exports layer cache but not the contents of --mount=type=cache, so the ccache mount started empty on every run (0% hit rate observed). Restore /root/.ccache from a host dir backed by actions/cache, inject it into BuildKit before each build via the cache-dance action, and extract the updated state for the next run. Same scope is shared by current and baseline since the SDK code overlap dominates ccache hit potential. --- .github/workflows/slo.yml | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/.github/workflows/slo.yml b/.github/workflows/slo.yml index 63145bba8f..db9e4d9841 100644 --- a/.github/workflows/slo.yml +++ b/.github/workflows/slo.yml @@ -106,6 +106,33 @@ jobs: cp sdk-current/tests/slo_workloads/.dockerignore sdk-current/.dockerignore cp sdk-baseline/tests/slo_workloads/.dockerignore sdk-baseline/.dockerignore + # `cache-to: type=gha` does NOT export `--mount=type=cache` content, so + # ccache state is lost between runs. Persist /root/.ccache via host + # directory + cache-dance: actions/cache restores the host dir, the + # dance injects it into the BuildKit cache mount before the build and + # extracts the updated state afterwards for the next save. + - name: Restore ccache + id: ccache + uses: actions/cache@v4 + with: + path: ccache + key: slo-ccache-${{ matrix.sdk.preset }}-${{ github.run_id }} + restore-keys: | + slo-ccache-${{ matrix.sdk.preset }}- + + - name: Inject ccache into BuildKit + uses: reproducible-containers/buildkit-cache-dance@v3.1.2 + with: + cache-map: | + { + "ccache": "/root/.ccache" + } + # Always extract so newly-compiled TUs from this run are saved by + # actions/cache (key uses ${{ github.run_id }}, so each run gets + # its own snapshot). Without extraction the cache stays frozen at + # whatever was first persisted. + skip-extraction: false + # A clean build of the SLO image takes ~30 min because the Dockerfile # rebuilds the full C++ toolchain + abseil/protobuf/grpc from source. # The GHA cache lets subsequent runs reuse every layer up to the SDK