From 65fc7c283472f321312ee20eb6021f5f69bc6f20 Mon Sep 17 00:00:00 2001
From: Vladislav Polyakov <polRk@ydb.tech>
Date: Tue, 5 May 2026 16:40:35 +0300
Subject: [PATCH 01/11] tests/slo_workloads: align with ydb-slo-action v2
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The v2 SLO action (ydb-platform/ydb-slo-action) starts a workload
container once per run, passes config via env vars (WORKLOAD_REF,
WORKLOAD_DURATION, YDB_CONNECTION_STRING, ...), and queries
sdk_operation_latency_p{50,95,99}_seconds as pre-computed gauges
labeled by (operation_type, operation_status, ref).

Changes:

- Dockerfile: drop SLO_BRANCH_REF compile-time ref baking; a single
  ENTRYPOINT binary without CMD is now sufficient.
- utils/utils.cpp: subcommand is optional — with no free-arg the
  binary runs create -> run -> cleanup sequentially: run is skipped
  if create fails, cleanup always runs (even if create or run
  errored), and the first non-zero result is kept. Option resolution
  follows the standard CLI > env > default priority via
  .DefaultValue():
  - -c from YDB_CONNECTION_STRING (or built from YDB_ENDPOINT +
    YDB_DATABASE); -c stays required when neither yields a value;
  - --metrics-push-url from OTEL_EXPORTER_OTLP_METRICS_ENDPOINT;
  - --time from WORKLOAD_DURATION (seconds; zero or unparsable values
    are ignored).
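- utils/metrics.cpp: replace the OTel Histogram with HDR-backed
  gauges. Only successful operations are recorded; a background
  thread wakes up every second, snapshots p50/p95/p99, resets the HDR
  window, and then publishes the gauges with
  operation_status="success" (an empty window publishes nothing).
  sdk_operations_total gains an operation_status label
  ("success"/"error"); the sdk_operations_success_total,
  sdk_operations_failure_total and sdk_errors_total counters are
  removed. sdk_retry_attempts_total becomes a counter of
  (retry_attempts + 1) per op. The export interval goes from 250 ms
  to 1 s. ref is read from WORKLOAD_REF at startup, not compile time.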
- utils/CMakeLists.txt: pull HdrHistogram_c 0.11.8 via FetchContent
  (opt-in via the outer YDB_SDK_TESTS flag) and link slo-utils
  against hdr_histogram_static.
---
 tests/slo_workloads/Dockerfile           |   3 +-
 tests/slo_workloads/utils/CMakeLists.txt |  19 +-
 tests/slo_workloads/utils/metrics.cpp    | 239 ++++++++++++++---------
 tests/slo_workloads/utils/utils.cpp      |  70 ++++++-
 tests/slo_workloads/utils/utils.h        |   3 +-
 5 files changed, 237 insertions(+), 97 deletions(-)

diff --git a/tests/slo_workloads/Dockerfile b/tests/slo_workloads/Dockerfile
index 091ce23066..65d9c7693b 100644
--- a/tests/slo_workloads/Dockerfile
+++ b/tests/slo_workloads/Dockerfile
@@ -1,7 +1,6 @@
 FROM ubuntu:22.04
 
 ARG PRESET=release-test-clang
-ARG REF=unknown
 
 # Install software-properties-common for add-apt-repository
 RUN apt-get -y update && apt-get -y install software-properties-common && add-apt-repository ppa:ubuntu-toolchain-r/test
@@ -120,7 +119,7 @@ COPY . /ydb-cpp-sdk
 WORKDIR /ydb-cpp-sdk
 RUN rm -rf build
 
-RUN cmake -DSLO_BRANCH_REF=${REF} --preset ${PRESET}
+RUN cmake --preset ${PRESET}
 RUN cmake --build --preset default --target slo-key-value
 
 ENTRYPOINT ["./build/tests/slo_workloads/key_value/slo-key-value"]
diff --git a/tests/slo_workloads/utils/CMakeLists.txt b/tests/slo_workloads/utils/CMakeLists.txt
index e8589a568f..5434af9bb4 100644
--- a/tests/slo_workloads/utils/CMakeLists.txt
+++ b/tests/slo_workloads/utils/CMakeLists.txt
@@ -1,3 +1,16 @@
+include(FetchContent)
+
+FetchContent_Declare(
+  hdr_histogram
+  GIT_REPOSITORY https://github.com/HdrHistogram/HdrHistogram_c.git
+  GIT_TAG        0.11.8
+  EXCLUDE_FROM_ALL
+)
+set(HDR_HISTOGRAM_BUILD_PROGRAMS OFF CACHE BOOL "" FORCE)
+set(HDR_HISTOGRAM_BUILD_SHARED   OFF CACHE BOOL "" FORCE)
+set(HDR_LOG_REQUIRED             OFF CACHE BOOL "" FORCE)
+FetchContent_MakeAvailable(hdr_histogram)
+
 add_library(slo-utils)
 
 target_link_libraries(slo-utils PUBLIC
@@ -9,9 +22,9 @@ target_link_libraries(slo-utils PUBLIC
   opentelemetry-cpp::otlp_http_metric_exporter
 )
 
-if (SLO_BRANCH_REF)
-  target_compile_definitions(slo-utils PRIVATE REF=${SLO_BRANCH_REF})
-endif()
+target_link_libraries(slo-utils PRIVATE
+  hdr_histogram_static
+)
 
 target_sources(slo-utils PRIVATE
   executor.cpp
diff --git a/tests/slo_workloads/utils/metrics.cpp b/tests/slo_workloads/utils/metrics.cpp
index 50e1f859c0..996a48780f 100644
--- a/tests/slo_workloads/utils/metrics.cpp
+++ b/tests/slo_workloads/utils/metrics.cpp
@@ -11,21 +11,93 @@
 
 #include <ydb-cpp-sdk/client/resources/ydb_resources.h>
 
+#include <util/system/env.h>
+
+#include <hdr/hdr_histogram.h>
+
+#include <atomic>
+#include <chrono>
+#include <memory>
+#include <mutex>
+#include <thread>
+
 
 using namespace std::chrono_literals;
 
-#ifdef REF
-static constexpr const std::string_view REF_LABEL = Y_STRINGIZE(REF);
-#else
-static constexpr const std::string_view REF_LABEL = "unknown";
-#endif
+namespace {
+
+constexpr std::int64_t kHdrMinLatencyNs = 1'000;          // 1 us
+constexpr std::int64_t kHdrMaxLatencyNs = 60'000'000'000; // 60 s
+constexpr int kHdrSignificantFigures = 3;
+
+std::string ResolveWorkloadRef() {
+    std::string ref = GetEnv("WORKLOAD_REF");
+    return ref.empty() ? "unknown" : ref;
+}
+
+// Minimal thread-safe wrapper around hdr_histogram for a single
+// (operation_type, operation_status="success") series. Only successful
+// latencies are recorded; errors are excluded from the percentile stream
+// per deploy/metrics.yaml.
+class TLatencyRecorder {
+public:
+    TLatencyRecorder() {
+        hdr_histogram* raw = nullptr;
+        int rc = hdr_init(kHdrMinLatencyNs, kHdrMaxLatencyNs, kHdrSignificantFigures, &raw);
+        Y_ABORT_UNLESS(rc == 0, "hdr_init failed: %d", rc);
+        Histogram_.reset(raw);
+    }
+
+    void Record(TDuration d) {
+        std::int64_t ns = static_cast<std::int64_t>(d.NanoSeconds());
+        if (ns < kHdrMinLatencyNs) {
+            ns = kHdrMinLatencyNs;
+        } else if (ns > kHdrMaxLatencyNs) {
+            ns = kHdrMaxLatencyNs;
+        }
+        std::lock_guard lock(Mutex_);
+        hdr_record_value(Histogram_.get(), ns);
+    }
+
+    // Returns p50/p95/p99 as seconds and resets the recorder window so
+    // gauges reflect only the most recent interval.
+    struct TPercentiles {
+        double P50 = 0.0;
+        double P95 = 0.0;
+        double P99 = 0.0;
+        bool HasData = false;
+    };
+
+    TPercentiles SnapshotAndReset() {
+        TPercentiles out;
+        std::lock_guard lock(Mutex_);
+        if (Histogram_->total_count == 0) {
+            return out;
+        }
+        out.HasData = true;
+        out.P50 = hdr_value_at_percentile(Histogram_.get(), 50.0) / 1e9;
+        out.P95 = hdr_value_at_percentile(Histogram_.get(), 95.0) / 1e9;
+        out.P99 = hdr_value_at_percentile(Histogram_.get(), 99.0) / 1e9;
+        hdr_reset(Histogram_.get());
+        return out;
+    }
+
+private:
+    struct THdrDeleter {
+        void operator()(hdr_histogram* h) const noexcept { if (h) hdr_close(h); }
+    };
+
+    std::mutex Mutex_;
+    std::unique_ptr<hdr_histogram, THdrDeleter> Histogram_;
+};
 
 class TOtelMetricsPusher : public IMetricsPusher {
 public:
     TOtelMetricsPusher(const std::string& metricsPushUrl, const std::string& operationType)
         : OperationType_(operationType)
+        , Ref_(ResolveWorkloadRef())
         , CommonAttributes_{
-            {"ref", std::string(REF_LABEL)},
+            {"ref", Ref_},
             {"sdk", "cpp"},
             {"sdk_version", NYdb::GetSdkSemver()}
         }
@@ -36,12 +108,11 @@ class TOtelMetricsPusher : public IMetricsPusher {
         auto exporter = opentelemetry::exporter::otlp::OtlpHttpMetricExporterFactory::Create(exporterOptions);
 
         opentelemetry::sdk::metrics::PeriodicExportingMetricReaderOptions readerOptions;
-        readerOptions.export_interval_millis = 250ms;
-        readerOptions.export_timeout_millis  = 200ms;
+        readerOptions.export_interval_millis = 1000ms;
+        readerOptions.export_timeout_millis  = 900ms;
 
         auto metricReader = opentelemetry::sdk::metrics::PeriodicExportingMetricReaderFactory::Create(std::move(exporter), readerOptions);
 
-        // Create MeterContext with resource
         auto context = std::make_unique<opentelemetry::sdk::metrics::MeterContext>(
             std::unique_ptr<opentelemetry::sdk::metrics::ViewRegistry>(new opentelemetry::sdk::metrics::ViewRegistry()),
             opentelemetry::sdk::resource::Resource::Create(opentelemetry::common::MakeKeyValueIterableView(CommonAttributes_))
@@ -53,97 +124,85 @@ class TOtelMetricsPusher : public IMetricsPusher {
         Meter_ = MeterProvider_->GetMeter("slo_workloads", NYdb::GetSdkSemver());
 
         InitMetrics();
+        StartPercentilePublisher();
+    }
+
+    ~TOtelMetricsPusher() override {
+        PublisherShouldStop_.store(true);
+        if (PublisherThread_.joinable()) {
+            PublisherThread_.join();
+        }
     }
 
     void PushRequestData(const TRequestData& requestData) override {
-        if (requestData.Status == NYdb::EStatus::SUCCESS) {
-            OperationsSuccessTotal_->Add(1, MergeAttributes({{"operation_type", OperationType_}}));
-        } else {
-            ErrorsTotal_->Add(1, MergeAttributes({{"status", YdbStatusToString(requestData.Status)}}));
-            OperationsFailureTotal_->Add(1, MergeAttributes({{"operation_type", OperationType_}}));
+        const bool success = requestData.Status == NYdb::EStatus::SUCCESS;
+        const std::string status = success ? "success" : "error";
+
+        OperationsTotal_->Add(1, MergeAttributes({
+            {"operation_type", OperationType_},
+            {"operation_status", status},
+        }));
+
+        // sdk_retry_attempts_total = total number of technical attempts
+        // including the first one. TStatUnit counts only post-first attempts,
+        // so add 1 to include the initial attempt.
+        RetryAttemptsTotal_->Add(static_cast<double>(requestData.RetryAttempts + 1),
+            MergeAttributes({
+                {"operation_type", OperationType_},
+            })
+        );
+
+        if (success) {
+            Latency_.Record(requestData.Delay);
         }
-        OperationsTotal_->Add(1, MergeAttributes({{"operation_type", OperationType_}}));
-        OperationLatencySeconds_->Record(requestData.Delay.SecondsFloat(), MergeAttributes({{"operation_type", OperationType_}, {"status", YdbStatusToString(requestData.Status)}}));
-        RetryAttempts_->Record(requestData.RetryAttempts, MergeAttributes({{"operation_type", OperationType_}}));
     }
 
 private:
     void InitMetrics() {
-        ErrorsTotal_ = Meter_->CreateUInt64Counter("sdk_errors_total",
-            "Total number of errors encountered, categorized by error type."
+        OperationsTotal_ = Meter_->CreateDoubleCounter("sdk_operations_total",
+            "Total number of operations, categorized by operation type and status."
         );
 
-        OperationsTotal_ = Meter_->CreateUInt64Counter("sdk_operations_total",
-            "Total number of operations, categorized by type attempted by the SDK."
-        );
-    
-        OperationsSuccessTotal_ = Meter_->CreateUInt64Counter("sdk_operations_success_total",
-            "Total number of successful operations, categorized by type."
+        RetryAttemptsTotal_ = Meter_->CreateDoubleCounter("sdk_retry_attempts_total",
+            "Total number of retry attempts (including the first attempt), categorized by operation type."
         );
 
-        OperationsFailureTotal_ = Meter_->CreateUInt64Counter("sdk_operations_failure_total",
-            "Total number of failed operations, categorized by type."
+        LatencyP50_ = Meter_->CreateDoubleGauge("sdk_operation_latency_p50_seconds",
+            "P50 latency of successful operations in seconds.", "s"
         );
-
-        OperationLatencySeconds_ = CreateDoubleHistogram("sdk_operation_latency_seconds",
-            "Latency of operations performed by the SDK in seconds, categorized by type and status.",
-            {
-				0.001,  // 1 ms
-				0.002,  // 2 ms
-				0.003,  // 3 ms
-				0.004,  // 4 ms
-				0.005,  // 5 ms
-				0.0075, // 7.5 ms
-				0.010,  // 10 ms
-				0.020,  // 20 ms
-				0.050,  // 50 ms
-				0.100,  // 100 ms
-				0.200,  // 200 ms
-				0.500,  // 500 ms
-				1.000,  // 1 s
-			},
-            "s"
+        LatencyP95_ = Meter_->CreateDoubleGauge("sdk_operation_latency_p95_seconds",
+            "P95 latency of successful operations in seconds.", "s"
         );
-
-        RetryAttempts_ = Meter_->CreateInt64Gauge("sdk_retry_attempts",
-            "Current retry attempts, categorized by operation type."
+        LatencyP99_ = Meter_->CreateDoubleGauge("sdk_operation_latency_p99_seconds",
+            "P99 latency of successful operations in seconds.", "s"
         );
     }
 
-    std::unique_ptr<opentelemetry::metrics::Histogram<double>> CreateDoubleHistogram(
-        const std::string& name,
-        const std::string& description,
-        const std::vector<double>& buckets,
-        const std::string& unit = {})
-    {
-        auto selector = std::make_unique<opentelemetry::sdk::metrics::InstrumentSelector>(
-            opentelemetry::sdk::metrics::InstrumentType::kHistogram,
-            name,
-            unit
-        );
-
-        auto meterSelector = std::make_unique<opentelemetry::sdk::metrics::MeterSelector>(
-            "slo_workloads",
-            NYdb::GetSdkSemver(),
-            ""
-        );
-
-        auto histogramConfig = std::make_shared<opentelemetry::sdk::metrics::HistogramAggregationConfig>();
-        histogramConfig->boundaries_ = buckets;
-
-        auto view = std::make_unique<opentelemetry::sdk::metrics::View>(
-            "",
-            "",
-            opentelemetry::sdk::metrics::AggregationType::kHistogram,
-            histogramConfig
-        );
-
-        MeterProvider_->AddView(std::move(selector), std::move(meterSelector), std::move(view));
+    void StartPercentilePublisher() {
+        PublisherThread_ = std::thread([this]() {
+            while (!PublisherShouldStop_.load(std::memory_order_relaxed)) {
+                std::this_thread::sleep_for(1s);
+                PublishPercentiles();
+            }
+            // Final flush before exit.
+            PublishPercentiles();
+        });
+    }
 
-        return Meter_->CreateDoubleHistogram(name, description, unit);
+    void PublishPercentiles() {
+        auto snapshot = Latency_.SnapshotAndReset();
+        if (!snapshot.HasData) {
+            return;
+        }
+        auto attrs = MergeAttributes({
+            {"operation_type", OperationType_},
+            {"operation_status", "success"},
+        });
+        LatencyP50_->Record(snapshot.P50, attrs);
+        LatencyP95_->Record(snapshot.P95, attrs);
+        LatencyP99_->Record(snapshot.P99, attrs);
     }
 
-    // Helper to merge common attributes with metric-specific ones
     std::map<std::string, std::string> MergeAttributes(const std::map<std::string, std::string>& metricAttrs) const {
         std::map<std::string, std::string> result = CommonAttributes_;
         result.insert(metricAttrs.begin(), metricAttrs.end());
@@ -151,17 +210,21 @@ class TOtelMetricsPusher : public IMetricsPusher {
     }
 
     std::string OperationType_;
-    std::map<std::string, std::string> CommonAttributes_;  // ref, sdk, sdk_version
+    std::string Ref_;
+    std::map<std::string, std::string> CommonAttributes_;
 
     std::unique_ptr<opentelemetry::sdk::metrics::MeterProvider> MeterProvider_;
     std::shared_ptr<opentelemetry::metrics::Meter> Meter_;
 
-    std::unique_ptr<opentelemetry::metrics::Counter<uint64_t>> ErrorsTotal_;
-    std::unique_ptr<opentelemetry::metrics::Counter<uint64_t>> OperationsTotal_;
-    std::unique_ptr<opentelemetry::metrics::Counter<uint64_t>> OperationsSuccessTotal_;
-    std::unique_ptr<opentelemetry::metrics::Counter<uint64_t>> OperationsFailureTotal_;
-    std::unique_ptr<opentelemetry::metrics::Histogram<double>> OperationLatencySeconds_;
-    std::unique_ptr<opentelemetry::metrics::Gauge<int64_t>> RetryAttempts_;
+    std::unique_ptr<opentelemetry::metrics::Counter<double>> OperationsTotal_;
+    std::unique_ptr<opentelemetry::metrics::Counter<double>> RetryAttemptsTotal_;
+    std::unique_ptr<opentelemetry::metrics::Gauge<double>> LatencyP50_;
+    std::unique_ptr<opentelemetry::metrics::Gauge<double>> LatencyP95_;
+    std::unique_ptr<opentelemetry::metrics::Gauge<double>> LatencyP99_;
+
+    TLatencyRecorder Latency_;
+    std::thread PublisherThread_;
+    std::atomic<bool> PublisherShouldStop_{false};
 };
 
 class TNoopMetricsPusher : public IMetricsPusher {
@@ -169,6 +232,8 @@ class TNoopMetricsPusher : public IMetricsPusher {
     void PushRequestData([[maybe_unused]] const TRequestData& requestData) override {}
 };
 
+} // namespace
+
 std::unique_ptr<IMetricsPusher> CreateOtelMetricsPusher(const std::string& metricsPushUrl, const std::string& operationType) {
     return std::make_unique<TOtelMetricsPusher>(metricsPushUrl, operationType);
 }
diff --git a/tests/slo_workloads/utils/utils.cpp b/tests/slo_workloads/utils/utils.cpp
index b7572c2981..78231c8e1c 100644
--- a/tests/slo_workloads/utils/utils.cpp
+++ b/tests/slo_workloads/utils/utils.cpp
@@ -7,6 +7,8 @@
 #include <util/folder/path.h>
 #include <util/folder/dirut.h>
 #include <util/stream/file.h>
+#include <util/string/builder.h>
+#include <util/string/cast.h>
 #include <util/string/strip.h>
 #include <util/system/env.h>
 #include <util/random/random.h>
@@ -110,6 +112,19 @@ std::string GetDatabase(const std::string& connectionString) {
     return {};
 }
 
+static std::string DefaultConnectionStringFromEnv() {
+    std::string cs = GetEnv("YDB_CONNECTION_STRING");
+    if (!cs.empty()) {
+        return cs;
+    }
+    std::string endpoint = GetEnv("YDB_ENDPOINT");
+    std::string database = GetEnv("YDB_DATABASE");
+    if (!endpoint.empty() && !database.empty()) {
+        return TStringBuilder() << endpoint << "/?database=" << database;
+    }
+    return {};
+}
+
 int DoMain(int argc, char** argv, TCreateCommand create, TRunCommand run, TCleanupCommand cleanup) {
     TOpts opts = TOpts::Default();
 
@@ -121,8 +136,15 @@ int DoMain(int argc, char** argv, TCreateCommand create, TRunCommand run, TClean
     std::string statConfigFile;
     std::string balancingPolicy;
 
-    opts.AddLongOption('c', "connection-string", "YDB connection string").Required().RequiredArgument("SCHEMA://HOST:PORT/?DATABASE=DATABASE")
+    std::string defaultConnectionString = DefaultConnectionStringFromEnv();
+
+    auto& connOpt = opts.AddLongOption('c', "connection-string", "YDB connection string").RequiredArgument("SCHEMA://HOST:PORT/?DATABASE=DATABASE")
         .StoreResult(&connectionString);
+    if (!defaultConnectionString.empty()) {
+        connOpt.DefaultValue(defaultConnectionString);
+    } else {
+        connOpt.Required();
+    }
     opts.AddLongOption('p', "prefix", "Base prefix for tables").RequiredArgument("PATH")
         .StoreResult(&prefix);
     opts.AddLongOption('k', "token", "security token").RequiredArgument("TOKEN")
@@ -136,7 +158,7 @@ int DoMain(int argc, char** argv, TCreateCommand create, TRunCommand run, TClean
     opts.AddLongOption('b', "balancing-policy", "Balancing policy").Optional().DefaultValue("use-all-nodes").RequiredArgument("(use-all-nodes|prefer-local-dc|prefer-primary-pile)")
         .StoreResult(&balancingPolicy);
     opts.AddHelpOption('h');
-    opts.SetFreeArgsMin(1);
+    opts.SetFreeArgsMin(0);
     opts.SetFreeArgTitle(0, "<COMMAND>", GetCmdList());
     opts.ArgPermutation_ = NLastGetopt::REQUIRE_ORDER;
 
@@ -144,7 +166,8 @@ int DoMain(int argc, char** argv, TCreateCommand create, TRunCommand run, TClean
     size_t freeArgsPos = res.GetFreeArgsPos();
     argc -= freeArgsPos;
     argv += freeArgsPos;
-    ECommandType command = ParseCommand(*argv);
+
+    ECommandType command = (argc > 0) ? ParseCommand(*argv) : ECommandType::All;
     if (command == ECommandType::Unknown) {
         Cerr << "Unknown command '" << *argv << "'" << Endl;
         return EXIT_FAILURE;
@@ -202,6 +225,28 @@ int DoMain(int argc, char** argv, TCreateCommand create, TRunCommand run, TClean
             Cout << "Launching cleanup command..." << Endl;
             result = cleanup(dbOptions, argc);
             break;
+        case ECommandType::All: {
+            Cout << "Launching full lifecycle: create -> run -> cleanup" << Endl;
+            // Synthesize argv with a fake program name so the inner NLastGetopt
+            // parsers (ParseOptionsCreate / ParseOptionsRun) treat argv[0]
+            // as the program name and parse zero real args.
+            char programName[] = "slo";
+            char* fakeArgv[] = { programName, nullptr };
+            int fakeArgc = 1;
+
+            Cout << "[all] Launching create command..." << Endl;
+            result = create(dbOptions, fakeArgc, fakeArgv);
+            if (!result) {
+                Cout << "[all] Launching run command..." << Endl;
+                result = run(dbOptions, fakeArgc, fakeArgv);
+            }
+            Cout << "[all] Launching cleanup command..." << Endl;
+            int cleanupRc = cleanup(dbOptions, fakeArgc);
+            if (!result) {
+                result = cleanupRc;
+            }
+            break;
+        }
         default:
             Cerr << "Unknown command" << Endl;
             return EXIT_FAILURE;
@@ -216,7 +261,7 @@ int DoMain(int argc, char** argv, TCreateCommand create, TRunCommand run, TClean
 }
 
 std::string GetCmdList() {
-    return "create, run, cleanup";
+    return "create, run, cleanup (omit to run create -> run -> cleanup in one process)";
 }
 
 ECommandType ParseCommand(const char* cmd) {
@@ -425,6 +470,11 @@ TTableStats GetTableStats(TDatabaseOptions& dbOptions, const std::string& tableN
 }
 
 void ParseOptionsCommon(TOpts& opts, TCommonOptions& options) {
+    std::string metricsPushUrlFromEnv = GetEnv("OTEL_EXPORTER_OTLP_METRICS_ENDPOINT");
+    if (!metricsPushUrlFromEnv.empty()) {
+        options.MetricsPushUrl = metricsPushUrlFromEnv;
+    }
+
     opts.AddLongOption("threads", "Number of threads to use").RequiredArgument("NUM")
         .DefaultValue(options.MaxInputThreads).StoreResult(&options.MaxInputThreads);
     opts.AddLongOption("stop-on-error", "Stop thread if an error occured").NoArgument()
@@ -485,6 +535,18 @@ bool ParseOptionsCreate(int argc, char** argv, TCreateOptions& createOptions) {
 bool ParseOptionsRun(int argc, char** argv, TRunOptions& runOptions) {
     TOpts opts = TOpts::Default();
     ParseOptionsCommon(opts, runOptions.CommonOptions);
+
+    if (std::string workloadDuration = GetEnv("WORKLOAD_DURATION"); !workloadDuration.empty()) {
+        try {
+            std::uint32_t parsed = FromString<std::uint32_t>(workloadDuration);
+            if (parsed > 0) {
+                runOptions.CommonOptions.SecondsToRun = parsed;
+            }
+        } catch (const std::exception& e) {
+            Cerr << "Invalid WORKLOAD_DURATION env value '" << workloadDuration << "': " << e.what() << Endl;
+        }
+    }
+
     opts.AddLongOption("time", "Time to run (Seconds)").RequiredArgument("Seconds")
         .DefaultValue(runOptions.CommonOptions.SecondsToRun).StoreResult(&runOptions.CommonOptions.SecondsToRun);
     opts.AddLongOption("read-rps", "Request generation rate for read requests (Thread A)").RequiredArgument("NUM")
diff --git a/tests/slo_workloads/utils/utils.h b/tests/slo_workloads/utils/utils.h
index 65be9f4891..3eb3c48978 100644
--- a/tests/slo_workloads/utils/utils.h
+++ b/tests/slo_workloads/utils/utils.h
@@ -98,7 +98,8 @@ enum class ECommandType {
     Unknown,
     Create,
     Run,
-    Cleanup
+    Cleanup,
+    All,  // No free-arg passed: execute Create -> Run -> Cleanup in one process
 };
 
 struct TTableStats {

From 4d8f5385b6954e23111c1bea143367c852114a17 Mon Sep 17 00:00:00 2001
From: Vladislav Polyakov <polRk@ydb.tech>
Date: Tue, 5 May 2026 17:07:41 +0300
Subject: [PATCH 02/11] ci(slo): delegate to ydb-slo-action/init@v2
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Rewrite the SLO workflows following the pattern used by ydb-java-sdk
(ydb-platform/ydb-java-sdk#644).

slo.yml:
- Drop the hand-rolled docker run orchestration that spun up YDB and
  invoked the workload with --dont-push / explicit create/run phases.
  The v2 action owns that lifecycle via deploy/compose.yml — we just
  hand it two prebuilt images.
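- Gate on the `SLO` PR label and trigger on `labeled`, so adding the
  label starts a run; drop the `workflow_dispatch` trigger together
  with its inputs and the `branches: main` filter.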
- Build both images with a single `docker build` per ref; if the
  baseline commit can't be built (missing Dockerfile or compile error
  on a historical SHA), tag the current image as the baseline so the
  job still completes, comparing the current image against itself,
  instead of failing.
- Drop `--build-arg REF=…` — ref is now read from WORKLOAD_REF env at
  runtime.
- Rename matrix entry to `cpp-key-value` to match the built binary and
  collapse the per-compiler matrix to a single clang entry (gcc variant
  can be added back when needed).

slo_report.yml:
- Pin to @v2.
- Add a second job that removes the `SLO` label from the PR after the
  report is published, matching js-sdk/java-sdk/go-sdk.
---
 .github/workflows/slo.yml        | 278 ++++++++-----------------------
 .github/workflows/slo_report.yml |  25 ++-
 2 files changed, 95 insertions(+), 208 deletions(-)

diff --git a/.github/workflows/slo.yml b/.github/workflows/slo.yml
index 0618526617..4cdf4bfec8 100644
--- a/.github/workflows/slo.yml
+++ b/.github/workflows/slo.yml
@@ -2,275 +2,143 @@ name: SLO
 
 on:
   pull_request:
-    types: [opened, reopened, synchronize]
-    branches:
-      - main
-  workflow_dispatch:
-    inputs:
-      github_issue:
-        description: "GitHub issue number where the SLO results will be reported"
-        required: true
-      baseline_ref:
-        description: "Baseline commit/branch/tag to compare against (leave empty to auto-detect merge-base with main)"
-        required: false
-      slo_workload_duration_seconds:
-        description: "Duration of the SLO workload in seconds"
-        required: false
-        default: "600"
-      slo_workload_read_max_rps:
-        description: "Maximum read RPS for the SLO workload"
-        required: false
-        default: "1000"
-      slo_workload_write_max_rps:
-        description: "Maximum write RPS for the SLO workload"
-        required: false
-        default: "100"
+    types: [opened, reopened, synchronize, labeled]
 
 jobs:
   ydb-slo-action:
+    if: contains(github.event.pull_request.labels.*.name, 'SLO')
+
     name: Run YDB SLO Tests
     runs-on: ubuntu-latest
 
+    permissions:
+      contents: read
+
     strategy:
+      fail-fast: false
       matrix:
-        compiler: [clang, gcc]
-        include:
-          - workload: table
+        sdk:
+          - name: cpp-key-value
+            preset: release-test-clang
+            command: ""
 
     concurrency:
-      group: slo-${{ github.ref }}-${{ matrix.os }}-${{ matrix.workload }}-${{ matrix.compiler }}
+      group: slo-${{ github.ref }}-${{ matrix.sdk.name }}
       cancel-in-progress: true
 
     steps:
       - name: Install dependencies
         run: |
+          set -euxo pipefail
           YQ_VERSION=v4.48.2
           BUILDX_VERSION=0.30.1
           COMPOSE_VERSION=2.40.3
 
-          sudo curl -L https://github.com/mikefarah/yq/releases/download/${YQ_VERSION}/yq_linux_amd64 -o /usr/local/bin/yq && \
-            sudo chmod +x /usr/local/bin/yq
+          sudo curl -fLo /usr/local/bin/yq \
+            "https://github.com/mikefarah/yq/releases/download/${YQ_VERSION}/yq_linux_amd64"
+          sudo chmod +x /usr/local/bin/yq
 
-          echo "Updating Docker plugins..."
           sudo mkdir -p /usr/local/lib/docker/cli-plugins
 
-          echo "Installing Docker Buildx ${BUILDX_VERSION}..."
           sudo curl -fLo /usr/local/lib/docker/cli-plugins/docker-buildx \
             "https://github.com/docker/buildx/releases/download/v${BUILDX_VERSION}/buildx-v${BUILDX_VERSION}.linux-amd64"
           sudo chmod +x /usr/local/lib/docker/cli-plugins/docker-buildx
 
-          echo "Installing Docker Compose ${COMPOSE_VERSION}..."
           sudo curl -fLo /usr/local/lib/docker/cli-plugins/docker-compose \
             "https://github.com/docker/compose/releases/download/v${COMPOSE_VERSION}/docker-compose-linux-x86_64"
           sudo chmod +x /usr/local/lib/docker/cli-plugins/docker-compose
 
-          echo "Installed versions:"
           yq --version
           docker --version
           docker buildx version
           docker compose version
 
-      - name: Checkout current version
+      - name: Checkout current SDK version
         uses: actions/checkout@v5
         with:
-          path: current
+          path: sdk-current
           fetch-depth: 0
           submodules: true
 
       - name: Determine baseline commit
         id: baseline
+        working-directory: sdk-current
         run: |
-          cd current
-          if [[ -n "${{ inputs.baseline_ref }}" ]]; then
-            BASELINE="${{ inputs.baseline_ref }}"
-          else
-            BASELINE=$(git merge-base HEAD origin/main)
-          fi
-          echo "sha=$BASELINE" >> $GITHUB_OUTPUT
+          set -euo pipefail
+          BASELINE=$(git merge-base HEAD origin/main)
+          echo "sha=${BASELINE}" >> "$GITHUB_OUTPUT"
 
-          # Try to determine a human-readable ref name for baseline
-          # Check if baseline is on main
-          if git merge-base --is-ancestor $BASELINE origin/main && \
-             [ "$(git rev-parse origin/main)" = "$BASELINE" ]; then
+          if git merge-base --is-ancestor "${BASELINE}" origin/main && \
+             [ "$(git rev-parse origin/main)" = "${BASELINE}" ]; then
             BASELINE_REF="main"
           else
-            # Try to find a branch containing this commit
-            BRANCH=$(git branch -r --contains $BASELINE | grep -v HEAD | head -1 | sed 's/.*\///' || echo "")
-            if [ -n "$BRANCH" ]; then
+            BRANCH=$(git branch -r --contains "${BASELINE}" | grep -v HEAD | head -1 | sed 's|.*/||' || echo "")
+            if [ -n "${BRANCH}" ]; then
               BASELINE_REF="${BRANCH}@${BASELINE:0:7}"
             else
               BASELINE_REF="${BASELINE:0:7}"
             fi
           fi
-          echo "ref=$BASELINE_REF" >> $GITHUB_OUTPUT
+          echo "ref=${BASELINE_REF}" >> "$GITHUB_OUTPUT"
 
-      - name: Checkout baseline version
+      - name: Checkout baseline SDK version
         uses: actions/checkout@v5
         with:
           ref: ${{ steps.baseline.outputs.sha }}
-          path: baseline
+          path: sdk-baseline
           fetch-depth: 1
           submodules: true
 
-      - name: Build Workload Image
+      - name: Build current workload image
+        working-directory: sdk-current
         run: |
-          echo "Cleaning up Docker system before builds..."
-          docker system prune -af --volumes
-          docker builder prune -af
-          df -h
-
-          # Build current version
-          if [ -f "$GITHUB_WORKSPACE/current/tests/slo_workloads/Dockerfile" ]; then
-            echo "Building current app image..."
-            cd "$GITHUB_WORKSPACE/current"
-
-            # Use SLO-specific .dockerignore
-            cp tests/slo_workloads/.dockerignore .dockerignore
-
-            docker build -t ydb-app-current \
-              --build-arg REF="${{ github.head_ref || github.ref_name }}" \
-              --build-arg PRESET=release-test-${{ matrix.compiler }} \
-              -f tests/slo_workloads/Dockerfile .
-
-            # Clean up .dockerignore
-            rm -f .dockerignore
-          else
-            echo "No current app Dockerfile found"
-            exit 1
+          set -euxo pipefail
+          cp tests/slo_workloads/.dockerignore .dockerignore
+          docker build \
+            --platform linux/amd64 \
+            --build-arg PRESET=${{ matrix.sdk.preset }} \
+            -t ydb-app-current \
+            -f tests/slo_workloads/Dockerfile \
+            .
+          rm -f .dockerignore
+
+      - name: Build baseline workload image
+        working-directory: sdk-baseline
+        run: |
+          set -euxo pipefail
+          # If the historical commit lacks the SLO workload files or can't
+          # compile, fall back to the current image so the SLO run is still
+          # comparable against itself rather than silently failing.
+          if [ ! -f tests/slo_workloads/Dockerfile ]; then
+            echo "Baseline commit has no SLO Dockerfile; reusing current image"
+            docker tag ydb-app-current ydb-app-baseline
+            exit 0
           fi
 
-          docker system prune -f --volumes
-          docker builder prune -af
-
-          # Build baseline version
-          if [ -f "$GITHUB_WORKSPACE/baseline/tests/slo_workloads/Dockerfile" ]; then
-            echo "Building baseline app image..."
-            cd "$GITHUB_WORKSPACE/baseline"
-
-            # Use SLO-specific .dockerignore
-            cp tests/slo_workloads/.dockerignore .dockerignore
-
-            docker build -t ydb-app-baseline \
-              --build-arg REF="${{ steps.baseline.outputs.ref }}" \
-              --build-arg PRESET=release-test-${{ matrix.compiler }} \
-              -f tests/slo_workloads/Dockerfile .
-
-            # Clean up .dockerignore
-            rm -f .dockerignore
-          else
-            echo "No baseline app Dockerfile found"
-            exit 1
+          cp tests/slo_workloads/.dockerignore .dockerignore
+          if ! docker build \
+                --platform linux/amd64 \
+                --build-arg PRESET=${{ matrix.sdk.preset }} \
+                -t ydb-app-baseline \
+                -f tests/slo_workloads/Dockerfile \
+                .
+          then
+            echo "Baseline build failed; reusing current image"
+            docker tag ydb-app-current ydb-app-baseline
           fi
+          rm -f .dockerignore
 
-          docker system prune -f --volumes
-          docker builder prune -af
-
-          echo "Final disk space after builds:"
-          df -h
-
-      - name: Initialize YDB SLO
-        uses: ydb-platform/ydb-slo-action/init@main
+      - name: Run SLO Tests
+        uses: ydb-platform/ydb-slo-action/init@v2
+        timeout-minutes: 30
         with:
-          github_issue: ${{ github.event.inputs.github_issue }}
+          github_issue: ${{ github.event.pull_request.number }}
           github_token: ${{ secrets.GITHUB_TOKEN }}
-          workload_name: ${{ matrix.workload }}-${{ matrix.compiler }}
+          workload_name: ${{ matrix.sdk.name }}
+          workload_duration: "600"
           workload_current_ref: ${{ github.head_ref || github.ref_name }}
+          workload_current_image: ydb-app-current
+          workload_current_command: ${{ matrix.sdk.command }} --read-rps 1000 --write-rps 100
           workload_baseline_ref: ${{ steps.baseline.outputs.ref }}
-
-      - name: Prepare SLO Database
-        run: |
-          echo "Preparing SLO database..."
-          docker run --rm --network ydb_ydb-net \
-            --add-host "ydb:172.28.0.11" \
-            --add-host "ydb:172.28.0.12" \
-            --add-host "ydb:172.28.0.13" \
-            --add-host "ydb:172.28.0.99" \
-            ydb-app-current --connection-string grpc://ydb:2136/?database=/Root/testdb create --dont-push
-
-      - name: Run SLO Tests (parallel)
-        timeout-minutes: 15
-        run: |
-          DURATION=${{ inputs.slo_workload_duration_seconds || 600 }}
-          READ_RPS=${{ inputs.slo_workload_read_max_rps || 1000 }}
-          WRITE_RPS=${{ inputs.slo_workload_write_max_rps || 100 }}
-
-          ARGS="--connection-string grpc://ydb:2136/?database=/Root/testdb run \
-            --metrics-push-url http://prometheus:9090/api/v1/otlp/v1/metrics \
-            --time $DURATION \
-            --read-rps $READ_RPS \
-            --write-rps $WRITE_RPS \
-            --read-timeout 100 \
-            --write-timeout 100"
-
-          echo "Starting ydb-app-current..."
-          docker run -d \
-            --name ydb-app-current \
-            --network ydb_ydb-net \
-            --add-host "ydb:172.28.0.11" \
-            --add-host "ydb:172.28.0.12" \
-            --add-host "ydb:172.28.0.13" \
-            --add-host "ydb:172.28.0.99" \
-            ydb-app-current $ARGS
-
-          echo "Starting ydb-app-baseline..."
-          docker run -d \
-            --name ydb-app-baseline \
-            --network ydb_ydb-net \
-            --add-host "ydb:172.28.0.11" \
-            --add-host "ydb:172.28.0.12" \
-            --add-host "ydb:172.28.0.13" \
-            --add-host "ydb:172.28.0.99" \
-            ydb-app-baseline $ARGS
-
-          # Show initial logs
-          echo ""
-          echo "==================== INITIAL CURRENT LOGS ===================="
-          docker logs -n 15 ydb-app-current 2>&1 || echo "No current container"
-          echo ""
-          echo "==================== INITIAL BASELINE LOGS ===================="
-          docker logs -n 15 ydb-app-baseline 2>&1 || echo "No baseline container"
-          echo ""
-
-          # Wait for workloads to complete
-          echo "Waiting for workloads to complete (${DURATION}s)..."
-          sleep ${DURATION}
-
-          # Stop containers after workload duration and wait for graceful shutdown
-          echo "Stopping containers after ${DURATION}s..."
-          docker stop --timeout=30 ydb-app-current ydb-app-baseline 2>&1 || true
-
-          # Force kill if still running
-          docker kill ydb-app-current ydb-app-baseline 2>&1 || true
-
-          # Check exit codes
-          CURRENT_EXIT=$(docker inspect ydb-app-current --format='{{.State.ExitCode}}' 2>/dev/null || echo "1")
-          BASELINE_EXIT=$(docker inspect ydb-app-baseline --format='{{.State.ExitCode}}' 2>/dev/null || echo "0")
-
-          echo "Current container exit code: $CURRENT_EXIT"
-          echo "Baseline container exit code: $BASELINE_EXIT"
-
-          # Show final logs
-          echo ""
-          echo "==================== FINAL CURRENT LOGS ===================="
-          docker logs -n 15 ydb-app-current 2>&1 || echo "No current container"
-          echo ""
-          echo "==================== FINAL BASELINE LOGS ===================="
-          docker logs -n 15 ydb-app-baseline 2>&1 || echo "No baseline container"
-          echo ""
-
-          echo "SUCCESS: Workloads completed successfully"
-
-      - if: always()
-        name: Store logs
-        run: |
-          docker logs ydb-app-current > current.log 2>&1 || echo "No current container"
-          docker logs ydb-app-baseline > baseline.log 2>&1 || echo "No baseline container"
-
-      - if: always()
-        uses: actions/upload-artifact@v4
-        with:
-          name: ${{ matrix.workload }}-${{ matrix.compiler }}-slo-cpp-sdk-logs
-          path: |
-            ./current.log
-            ./baseline.log
-          retention-days: 1
+          workload_baseline_image: ydb-app-baseline
+          workload_baseline_command: ${{ matrix.sdk.command }} --read-rps 1000 --write-rps 100
diff --git a/.github/workflows/slo_report.yml b/.github/workflows/slo_report.yml
index 0a7c2e3483..b2bbee5172 100644
--- a/.github/workflows/slo_report.yml
+++ b/.github/workflows/slo_report.yml
@@ -7,17 +7,36 @@ on:
       - completed
 
 jobs:
-  ydb-slo-action-report:
+  publish-slo-report:
+    if: github.event.workflow_run.conclusion == 'success'
     runs-on: ubuntu-latest
     name: Publish YDB SLO Report
     permissions:
       checks: write
       contents: read
       pull-requests: write
-    if: github.event.workflow_run.conclusion == 'success'
     steps:
       - name: Publish YDB SLO Report
-        uses: ydb-platform/ydb-slo-action/report@main
+        uses: ydb-platform/ydb-slo-action/report@v2
         with:
           github_token: ${{ secrets.GITHUB_TOKEN }}
           github_run_id: ${{ github.event.workflow_run.id }}
+
+  remove-slo-label:
+    if: github.event.workflow_run.event == 'pull_request'
+    runs-on: ubuntu-latest
+    name: Remove SLO Label
+    permissions:
+      pull-requests: write
+    steps:
+      - name: Remove SLO label from PR
+        env:
+          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          PRS: ${{ toJSON(github.event.workflow_run.pull_requests) }}
+          REPO: ${{ github.event.workflow_run.repository.full_name }}
+        run: |
+          set -euo pipefail
+          PR=$(jq -r '.[0].number' <<<"$PRS")
+          if [ "$PR" != "null" ] && [ -n "$PR" ]; then
+            gh pr edit "$PR" --repo "$REPO" --remove-label SLO
+          fi

From 2d9a928d10ab416c7c2b92c1bbe1e7759367db31 Mon Sep 17 00:00:00 2001
From: Vladislav Polyakov <polRk@ydb.tech>
Date: Tue, 5 May 2026 17:11:59 +0300
Subject: [PATCH 03/11] tests/slo_workloads: address Copilot review feedback
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Counters: switch OperationsTotal/RetryAttemptsTotal from DoubleCounter
  to UInt64Counter. Operation counts and retry attempts are inherently
  integer; float representation could drift for very high cumulative
  values and doesn't match the Prometheus counter convention.
- Percentile gauges: publish 0.0 on empty-window intervals instead of
  returning early. OTel's sync Gauge holds the last Record() value and
  the periodic exporter re-emits it every collection cycle — without an
  explicit reset, gauges would look "stuck" at the last non-empty value
  after load stops, contradicting the per-second HDR reset semantics.
---
 tests/slo_workloads/utils/metrics.cpp | 26 ++++++++++++++------------
 1 file changed, 14 insertions(+), 12 deletions(-)

diff --git a/tests/slo_workloads/utils/metrics.cpp b/tests/slo_workloads/utils/metrics.cpp
index 996a48780f..4506c1ca39 100644
--- a/tests/slo_workloads/utils/metrics.cpp
+++ b/tests/slo_workloads/utils/metrics.cpp
@@ -138,7 +138,7 @@ class TOtelMetricsPusher : public IMetricsPusher {
         const bool success = requestData.Status == NYdb::EStatus::SUCCESS;
         const std::string status = success ? "success" : "error";
 
-        OperationsTotal_->Add(1, MergeAttributes({
+        OperationsTotal_->Add(uint64_t{1}, MergeAttributes({
             {"operation_type", OperationType_},
             {"operation_status", status},
         }));
@@ -146,7 +146,7 @@ class TOtelMetricsPusher : public IMetricsPusher {
         // sdk_retry_attempts_total = total number of technical attempts
         // including the first one. TStatUnit counts only post-first attempts,
         // so add 1 to include the initial attempt.
-        RetryAttemptsTotal_->Add(static_cast<double>(requestData.RetryAttempts + 1),
+        RetryAttemptsTotal_->Add(requestData.RetryAttempts + 1,
             MergeAttributes({
                 {"operation_type", OperationType_},
             })
@@ -159,11 +159,11 @@ class TOtelMetricsPusher : public IMetricsPusher {
 
 private:
     void InitMetrics() {
-        OperationsTotal_ = Meter_->CreateDoubleCounter("sdk_operations_total",
+        OperationsTotal_ = Meter_->CreateUInt64Counter("sdk_operations_total",
             "Total number of operations, categorized by operation type and status."
         );
 
-        RetryAttemptsTotal_ = Meter_->CreateDoubleCounter("sdk_retry_attempts_total",
+        RetryAttemptsTotal_ = Meter_->CreateUInt64Counter("sdk_retry_attempts_total",
             "Total number of retry attempts (including the first attempt), categorized by operation type."
         );
 
@@ -191,16 +191,18 @@ class TOtelMetricsPusher : public IMetricsPusher {
 
     void PublishPercentiles() {
         auto snapshot = Latency_.SnapshotAndReset();
-        if (!snapshot.HasData) {
-            return;
-        }
         auto attrs = MergeAttributes({
             {"operation_type", OperationType_},
             {"operation_status", "success"},
         });
-        LatencyP50_->Record(snapshot.P50, attrs);
-        LatencyP95_->Record(snapshot.P95, attrs);
-        LatencyP99_->Record(snapshot.P99, attrs);
+        // When no successful ops landed in the last second, publish 0.0
+        // for all percentiles so the gauges reset with the HDR window
+        // rather than appearing "stuck" at the last non-empty value (the
+        // OTel periodic exporter would otherwise re-emit the previous
+        // Record() value on every collection cycle).
+        LatencyP50_->Record(snapshot.HasData ? snapshot.P50 : 0.0, attrs);
+        LatencyP95_->Record(snapshot.HasData ? snapshot.P95 : 0.0, attrs);
+        LatencyP99_->Record(snapshot.HasData ? snapshot.P99 : 0.0, attrs);
     }
 
     std::map<std::string, std::string> MergeAttributes(const std::map<std::string, std::string>& metricAttrs) const {
@@ -216,8 +218,8 @@ class TOtelMetricsPusher : public IMetricsPusher {
     std::unique_ptr<opentelemetry::sdk::metrics::MeterProvider> MeterProvider_;
     std::shared_ptr<opentelemetry::metrics::Meter> Meter_;
 
-    std::unique_ptr<opentelemetry::metrics::Counter<double>> OperationsTotal_;
-    std::unique_ptr<opentelemetry::metrics::Counter<double>> RetryAttemptsTotal_;
+    std::unique_ptr<opentelemetry::metrics::Counter<uint64_t>> OperationsTotal_;
+    std::unique_ptr<opentelemetry::metrics::Counter<uint64_t>> RetryAttemptsTotal_;
     std::unique_ptr<opentelemetry::metrics::Gauge<double>> LatencyP50_;
     std::unique_ptr<opentelemetry::metrics::Gauge<double>> LatencyP95_;
     std::unique_ptr<opentelemetry::metrics::Gauge<double>> LatencyP99_;

From cbe3597d10ed9ac7f2b7af9229dd4ed7d1602f3f Mon Sep 17 00:00:00 2001
From: Vladislav Polyakov <polRk@ydb.tech>
Date: Tue, 5 May 2026 17:27:27 +0300
Subject: [PATCH 04/11] tests/slo_workloads: make apt tolerant of PPA timeouts

Shared CI runners (and local docker builds) periodically hit
connection timeouts on ppa.launchpadcontent.net. Telling apt itself
to retry via Acquire::Retries=5 plus a 60 s connect timeout handles
the blip in-place without a shell retry loop.
---
 tests/slo_workloads/Dockerfile | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/tests/slo_workloads/Dockerfile b/tests/slo_workloads/Dockerfile
index 65d9c7693b..ca9d77d5fe 100644
--- a/tests/slo_workloads/Dockerfile
+++ b/tests/slo_workloads/Dockerfile
@@ -2,6 +2,12 @@ FROM ubuntu:22.04
 
 ARG PRESET=release-test-clang
 
+# Make apt tolerant of transient PPA/mirror timeouts (shared runners see
+# ppa.launchpadcontent.net connection timeouts every few builds).
+RUN echo 'Acquire::Retries "5";' > /etc/apt/apt.conf.d/80-retries && \
+    echo 'Acquire::http::Timeout "60";' >> /etc/apt/apt.conf.d/80-retries && \
+    echo 'Acquire::https::Timeout "60";' >> /etc/apt/apt.conf.d/80-retries
+
 # Install software-properties-common for add-apt-repository
 RUN apt-get -y update && apt-get -y install software-properties-common && add-apt-repository ppa:ubuntu-toolchain-r/test
 

From 1799ac35f1c15a1ff84368c04beee6022a328a1c Mon Sep 17 00:00:00 2001
From: Vladislav Polyakov <polRk@ydb.tech>
Date: Tue, 5 May 2026 18:06:21 +0300
Subject: [PATCH 05/11] ci(slo): make 30-min build resilient to network flakes
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Two separate issues were making every SLO PR a coin flip:

1. Any transient network error (GitHub release download, PPA timeout,
   abseil/protobuf/grpc tarball) kills the entire 30-min build. Each
   of the 9 wget calls and both apt-get steps is now retried:
   - wget gets --tries=5 --waitretry=15 --timeout=60
     --retry-connrefused --retry-on-http-error=500,502,503,504
     via a shared WGET_OPTS env var.
   - apt already has Acquire::Retries=5.

2. Every CI run does a full cold build. The Dockerfile's toolchain and
   dep layers (~25 min) never change between SDK commits, so caching
   them is trivially safe.
   - Switch slo.yml to docker/build-push-action@v6 with GHA type=gha
     cache export/import, scoped per preset.
   - First build on a runner still takes ~30 min; subsequent builds
     where only the SDK source changed should take ~3 min.
   - Baseline build uses continue-on-error + an explicit fallback
     step that retags ydb-app-current as ydb-app-baseline when the
     historical commit won't compile — replaces the inline shell
     if-else that docker/build-push-action can't express.
---
 .github/workflows/slo.yml      | 78 ++++++++++++++++++++--------------
 tests/slo_workloads/Dockerfile | 25 ++++++-----
 2 files changed, 59 insertions(+), 44 deletions(-)

diff --git a/.github/workflows/slo.yml b/.github/workflows/slo.yml
index 4cdf4bfec8..0982b7125c 100644
--- a/.github/workflows/slo.yml
+++ b/.github/workflows/slo.yml
@@ -89,44 +89,56 @@ jobs:
           fetch-depth: 1
           submodules: true
 
-      - name: Build current workload image
-        working-directory: sdk-current
-        run: |
-          set -euxo pipefail
-          cp tests/slo_workloads/.dockerignore .dockerignore
-          docker build \
-            --platform linux/amd64 \
-            --build-arg PRESET=${{ matrix.sdk.preset }} \
-            -t ydb-app-current \
-            -f tests/slo_workloads/Dockerfile \
-            .
-          rm -f .dockerignore
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v3
 
-      - name: Build baseline workload image
-        working-directory: sdk-baseline
+      # .dockerignore lives under tests/slo_workloads/; buildx expects it at
+      # the context root, so place a copy in each SDK checkout.
+      - name: Stage .dockerignore
         run: |
           set -euxo pipefail
-          # If the historical commit lacks the SLO workload files or can't
-          # compile, fall back to the current image so the SLO run is still
-          # comparable against itself rather than silently failing.
-          if [ ! -f tests/slo_workloads/Dockerfile ]; then
-            echo "Baseline commit has no SLO Dockerfile; reusing current image"
-            docker tag ydb-app-current ydb-app-baseline
-            exit 0
+          cp sdk-current/tests/slo_workloads/.dockerignore sdk-current/.dockerignore
+          if [ -f sdk-baseline/tests/slo_workloads/.dockerignore ]; then
+            cp sdk-baseline/tests/slo_workloads/.dockerignore sdk-baseline/.dockerignore
           fi
 
-          cp tests/slo_workloads/.dockerignore .dockerignore
-          if ! docker build \
-                --platform linux/amd64 \
-                --build-arg PRESET=${{ matrix.sdk.preset }} \
-                -t ydb-app-baseline \
-                -f tests/slo_workloads/Dockerfile \
-                .
-          then
-            echo "Baseline build failed; reusing current image"
-            docker tag ydb-app-current ydb-app-baseline
-          fi
-          rm -f .dockerignore
+      # A clean build of the SLO image takes ~30 min because the Dockerfile
+      # rebuilds the full C++ toolchain + abseil/protobuf/grpc from source.
+      # The GHA cache lets subsequent runs reuse every layer up to the SDK
+      # source COPY, so only the actual workload link step reruns (~3 min).
+      - name: Build current workload image
+        uses: docker/build-push-action@v6
+        with:
+          context: sdk-current
+          file: sdk-current/tests/slo_workloads/Dockerfile
+          platforms: linux/amd64
+          tags: ydb-app-current
+          load: true
+          build-args: PRESET=${{ matrix.sdk.preset }}
+          cache-from: type=gha,scope=slo-${{ matrix.sdk.preset }}
+          cache-to: type=gha,mode=max,scope=slo-${{ matrix.sdk.preset }}
+
+      - name: Build baseline workload image
+        id: baseline-build
+        continue-on-error: true
+        uses: docker/build-push-action@v6
+        with:
+          context: sdk-baseline
+          file: sdk-baseline/tests/slo_workloads/Dockerfile
+          platforms: linux/amd64
+          tags: ydb-app-baseline
+          load: true
+          build-args: PRESET=${{ matrix.sdk.preset }}
+          cache-from: type=gha,scope=slo-${{ matrix.sdk.preset }}
+
+      # If the historical commit lacks the SLO Dockerfile or can't compile,
+      # reuse the current image so the SLO run is still comparable against
+      # itself rather than failing outright.
+      - name: Fall back to current image for baseline
+        if: steps.baseline-build.outcome == 'failure'
+        run: |
+          echo "Baseline build failed; reusing current image as baseline."
+          docker tag ydb-app-current ydb-app-baseline
 
       - name: Run SLO Tests
         uses: ydb-platform/ydb-slo-action/init@v2
diff --git a/tests/slo_workloads/Dockerfile b/tests/slo_workloads/Dockerfile
index ca9d77d5fe..7f8ea5e78e 100644
--- a/tests/slo_workloads/Dockerfile
+++ b/tests/slo_workloads/Dockerfile
@@ -2,12 +2,15 @@ FROM ubuntu:22.04
 
 ARG PRESET=release-test-clang
 
-# Make apt tolerant of transient PPA/mirror timeouts (shared runners see
-# ppa.launchpadcontent.net connection timeouts every few builds).
+# Every RUN that hits the network retries on transient failures so one
+# flake doesn't throw away 30 min of previous build work. apt gets five
+# Acquire retries + 60 s timeouts; wget gets the equivalent via WGET_OPTS.
 RUN echo 'Acquire::Retries "5";' > /etc/apt/apt.conf.d/80-retries && \
     echo 'Acquire::http::Timeout "60";' >> /etc/apt/apt.conf.d/80-retries && \
     echo 'Acquire::https::Timeout "60";' >> /etc/apt/apt.conf.d/80-retries
 
+ENV WGET_OPTS="--tries=5 --waitretry=15 --timeout=60 --retry-connrefused --retry-on-http-error=500,502,503,504"
+
 # Install software-properties-common for add-apt-repository
 RUN apt-get -y update && apt-get -y install software-properties-common && add-apt-repository ppa:ubuntu-toolchain-r/test
 
@@ -20,7 +23,7 @@ RUN apt-get -y update && apt-get -y install \
 
 # Install CMake
 ENV CMAKE_VERSION=3.27.7
-RUN wget https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-linux-x86_64.sh \
+RUN wget $WGET_OPTS https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-linux-x86_64.sh \
     -q -O cmake-install.sh \
     && chmod u+x cmake-install.sh \
     && ./cmake-install.sh --skip-license --prefix=/usr/local \
@@ -28,7 +31,7 @@ RUN wget https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cm
 
 # Install LLVM
 ENV LLVM_VERSION=16
-RUN wget https://apt.llvm.org/llvm.sh && \
+RUN wget $WGET_OPTS https://apt.llvm.org/llvm.sh && \
     chmod u+x llvm.sh && \
     ./llvm.sh ${LLVM_VERSION} && \
     rm llvm.sh
@@ -45,7 +48,7 @@ RUN update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-13 10000 && \
 # Install abseil-cpp
 ENV ABSEIL_CPP_VERSION=20230802.0
 ENV ABSEIL_CPP_INSTALL_DIR=/root/ydb_deps/absl
-RUN wget -O abseil-cpp-${ABSEIL_CPP_VERSION}.tar.gz https://github.com/abseil/abseil-cpp/archive/refs/tags/${ABSEIL_CPP_VERSION}.tar.gz && \
+RUN wget $WGET_OPTS -O abseil-cpp-${ABSEIL_CPP_VERSION}.tar.gz https://github.com/abseil/abseil-cpp/archive/refs/tags/${ABSEIL_CPP_VERSION}.tar.gz && \
     tar -xvzf abseil-cpp-${ABSEIL_CPP_VERSION}.tar.gz && cd abseil-cpp-${ABSEIL_CPP_VERSION} && \
     mkdir build && cd build && \
     cmake -G Ninja -DCMAKE_BUILD_TYPE=Release -DABSL_PROPAGATE_CXX_STD=ON .. && \
@@ -56,7 +59,7 @@ RUN wget -O abseil-cpp-${ABSEIL_CPP_VERSION}.tar.gz https://github.com/abseil/ab
 # Install protobuf
 ENV PROTOBUF_VERSION=3.21.12
 ENV PROTOBUF_INSTALL_DIR=/root/ydb_deps/protobuf
-RUN wget -O protobuf-${PROTOBUF_VERSION}.tar.gz https://github.com/protocolbuffers/protobuf/archive/refs/tags/v${PROTOBUF_VERSION}.tar.gz && \
+RUN wget $WGET_OPTS -O protobuf-${PROTOBUF_VERSION}.tar.gz https://github.com/protocolbuffers/protobuf/archive/refs/tags/v${PROTOBUF_VERSION}.tar.gz && \
     tar -xvzf protobuf-${PROTOBUF_VERSION}.tar.gz && cd protobuf-${PROTOBUF_VERSION} && \
     mkdir build && cd build && \
     cmake -G Ninja -DCMAKE_BUILD_TYPE=Release -Dprotobuf_BUILD_TESTS=OFF -Dprotobuf_INSTALL=ON -Dprotobuf_ABSL_PROVIDER=package .. && \
@@ -67,7 +70,7 @@ RUN wget -O protobuf-${PROTOBUF_VERSION}.tar.gz https://github.com/protocolbuffe
 # Install grpc
 ENV GRPC_VERSION=1.54.3
 ENV GRPC_INSTALL_DIR=/root/ydb_deps/grpc
-RUN wget -O grpc-${GRPC_VERSION}.tar.gz https://github.com/grpc/grpc/archive/refs/tags/v${GRPC_VERSION}.tar.gz && \
+RUN wget $WGET_OPTS -O grpc-${GRPC_VERSION}.tar.gz https://github.com/grpc/grpc/archive/refs/tags/v${GRPC_VERSION}.tar.gz && \
     tar -xvzf grpc-${GRPC_VERSION}.tar.gz && cd grpc-${GRPC_VERSION} && \
     mkdir build && cd build && \
     cmake -G Ninja -DCMAKE_PREFIX_PATH="${ABSEIL_CPP_INSTALL_DIR};${PROTOBUF_INSTALL_DIR}" \
@@ -84,7 +87,7 @@ RUN wget -O grpc-${GRPC_VERSION}.tar.gz https://github.com/grpc/grpc/archive/ref
 # Install base64
 ENV BASE64_VERSION=0.5.2
 ENV BASE64_INSTALL_DIR=/root/ydb_deps/base64
-RUN wget -O base64-${BASE64_VERSION}.tar.gz https://github.com/aklomp/base64/archive/refs/tags/v${BASE64_VERSION}.tar.gz && \
+RUN wget $WGET_OPTS -O base64-${BASE64_VERSION}.tar.gz https://github.com/aklomp/base64/archive/refs/tags/v${BASE64_VERSION}.tar.gz && \
     tar -xvzf base64-${BASE64_VERSION}.tar.gz && cd base64-${BASE64_VERSION} && \
     mkdir build && cd build && \
     cmake -G Ninja -DCMAKE_BUILD_TYPE=Release .. && \
@@ -95,7 +98,7 @@ RUN wget -O base64-${BASE64_VERSION}.tar.gz https://github.com/aklomp/base64/arc
 # Install brotli
 ENV BROTLI_VERSION=1.1.0
 ENV BROTLI_INSTALL_DIR=/root/ydb_deps/brotli
-RUN wget -O brotli-${BROTLI_VERSION}.tar.gz https://github.com/google/brotli/archive/refs/tags/v${BROTLI_VERSION}.tar.gz && \
+RUN wget $WGET_OPTS -O brotli-${BROTLI_VERSION}.tar.gz https://github.com/google/brotli/archive/refs/tags/v${BROTLI_VERSION}.tar.gz && \
     tar -xvzf brotli-${BROTLI_VERSION}.tar.gz && cd brotli-${BROTLI_VERSION} && \
     mkdir build && cd build && \
     cmake -G Ninja -DCMAKE_BUILD_TYPE=Release .. && \
@@ -106,7 +109,7 @@ RUN wget -O brotli-${BROTLI_VERSION}.tar.gz https://github.com/google/brotli/arc
 # Install jwt-cpp
 ENV JWT_CPP_VERSION=0.7.0
 ENV JWT_CPP_INSTALL_DIR=/root/ydb_deps/jwt-cpp
-RUN wget -O jwt-cpp-${JWT_CPP_VERSION}.tar.gz https://github.com/Thalhammer/jwt-cpp/archive/refs/tags/v${JWT_CPP_VERSION}.tar.gz && \
+RUN wget $WGET_OPTS -O jwt-cpp-${JWT_CPP_VERSION}.tar.gz https://github.com/Thalhammer/jwt-cpp/archive/refs/tags/v${JWT_CPP_VERSION}.tar.gz && \
     tar -xvzf jwt-cpp-${JWT_CPP_VERSION}.tar.gz && cd jwt-cpp-${JWT_CPP_VERSION} && \
     mkdir build && cd build && \
     cmake -G Ninja -DCMAKE_BUILD_TYPE=Release .. && \
@@ -116,7 +119,7 @@ RUN wget -O jwt-cpp-${JWT_CPP_VERSION}.tar.gz https://github.com/Thalhammer/jwt-
 
 # Install ccache 4.8.1 or above
 ENV CCACHE_VERSION=4.8.1
-RUN wget https://github.com/ccache/ccache/releases/download/v${CCACHE_VERSION}/ccache-${CCACHE_VERSION}-linux-x86_64.tar.xz \
+RUN wget $WGET_OPTS https://github.com/ccache/ccache/releases/download/v${CCACHE_VERSION}/ccache-${CCACHE_VERSION}-linux-x86_64.tar.xz \
     && tar -xf ccache-${CCACHE_VERSION}-linux-x86_64.tar.xz \
     && cp ccache-${CCACHE_VERSION}-linux-x86_64/ccache /usr/local/bin/ \
     && rm -rf ccache-${CCACHE_VERSION}-linux-x86_64 ccache-${CCACHE_VERSION}-linux-x86_64.tar.xz

From da782d4c559470ede50c9857bbdb045d9bf4bb4f Mon Sep 17 00:00:00 2001
From: Vladislav Polyakov <polRk@ydb.tech>
Date: Tue, 5 May 2026 18:07:02 +0300
Subject: [PATCH 06/11] tests/slo_workloads: retry PPA/apt steps at the shell
 level

The earlier Acquire::Retries=5 tweak covered HTTP-level errors but
didn't handle TCP connect timeouts to ppa.launchpadcontent.net, which
is the actual failure mode observed on shared CI runners. Wrap both
the add-apt-repository step and the main apt-get install in shell
retry loops (5 attempts, 15/30/45/60/75 s backoff) so a CDN blip no
longer kills the 30-min build.
---
 tests/slo_workloads/Dockerfile | 33 ++++++++++++++++++++++++++-------
 1 file changed, 26 insertions(+), 7 deletions(-)

diff --git a/tests/slo_workloads/Dockerfile b/tests/slo_workloads/Dockerfile
index 7f8ea5e78e..7680560cac 100644
--- a/tests/slo_workloads/Dockerfile
+++ b/tests/slo_workloads/Dockerfile
@@ -11,15 +11,34 @@ RUN echo 'Acquire::Retries "5";' > /etc/apt/apt.conf.d/80-retries && \
 
 ENV WGET_OPTS="--tries=5 --waitretry=15 --timeout=60 --retry-connrefused --retry-on-http-error=500,502,503,504"
 
-# Install software-properties-common for add-apt-repository
-RUN apt-get -y update && apt-get -y install software-properties-common && add-apt-repository ppa:ubuntu-toolchain-r/test
+# Install software-properties-common and add the gcc-13 PPA.
+# Acquire::Retries only retries HTTP errors; TCP connect timeouts to
+# ppa.launchpadcontent.net still drop through and kill the step. Wrap the
+# whole command in a shell retry loop with linear backoff so a CDN
+# blip doesn't throw away 30 minutes of downstream build work.
+RUN for i in 1 2 3 4 5; do \
+        apt-get -y update && \
+        apt-get -y install software-properties-common && \
+        add-apt-repository -y ppa:ubuntu-toolchain-r/test && \
+        apt-get -y update && \
+        break; \
+        echo "add-apt-repository attempt $i failed; sleeping $((i * 15))s"; \
+        sleep $((i * 15)); \
+    done && \
+    apt-cache show gcc-13 > /dev/null  # fail fast if PPA never came up
 
 # Install C++ tools and libraries
-RUN apt-get -y update && apt-get -y install \
-    git gdb wget ninja-build libidn11-dev ragel yasm libc-ares-dev libre2-dev \
-    rapidjson-dev zlib1g-dev libxxhash-dev libzstd-dev libsnappy-dev libgtest-dev libgmock-dev \
-    libbz2-dev liblz4-dev libdouble-conversion-dev libssl-dev libstdc++-13-dev gcc-13 g++-13 \
-    && apt-get clean && rm -rf /var/lib/apt/lists/*
+RUN for i in 1 2 3 4 5; do \
+        apt-get -y install \
+            git gdb wget ninja-build libidn11-dev ragel yasm libc-ares-dev libre2-dev \
+            rapidjson-dev zlib1g-dev libxxhash-dev libzstd-dev libsnappy-dev libgtest-dev libgmock-dev \
+            libbz2-dev liblz4-dev libdouble-conversion-dev libssl-dev libstdc++-13-dev gcc-13 g++-13 && \
+        break; \
+        echo "apt-get install attempt $i failed; sleeping $((i * 15))s"; \
+        sleep $((i * 15)); \
+        apt-get -y update || true; \
+    done && \
+    apt-get clean && rm -rf /var/lib/apt/lists/*
 
 # Install CMake
 ENV CMAKE_VERSION=3.27.7

From a8da8bb6de1fcec95e257bd7fc54f8e790d7deeb Mon Sep 17 00:00:00 2001
From: Vladislav Polyakov <polRk@ydb.tech>
Date: Tue, 5 May 2026 20:07:39 +0300
Subject: [PATCH 07/11] tests/slo_workloads: forward run-phase args in implicit
 All mode

The v2 SLO action invokes the workload as `slo-key-value <run-args>`
without a subcommand keyword. The global parser used to error on
unknown long options like --read-rps; now it tolerates them and the
implicit All branch passes the leftover argv to the run phase.
---
 tests/slo_workloads/utils/utils.cpp | 33 +++++++++++++++++++++++------
 1 file changed, 26 insertions(+), 7 deletions(-)

diff --git a/tests/slo_workloads/utils/utils.cpp b/tests/slo_workloads/utils/utils.cpp
index 78231c8e1c..89a642b61d 100644
--- a/tests/slo_workloads/utils/utils.cpp
+++ b/tests/slo_workloads/utils/utils.cpp
@@ -161,6 +161,12 @@ int DoMain(int argc, char** argv, TCreateCommand create, TRunCommand run, TClean
     opts.SetFreeArgsMin(0);
     opts.SetFreeArgTitle(0, "<COMMAND>", GetCmdList());
     opts.ArgPermutation_ = NLastGetopt::REQUIRE_ORDER;
+    // Run-phase options (--read-rps, --write-rps, …) reach DoMain when the
+    // caller invokes the workload without an explicit subcommand (the v2 SLO
+    // action contract). Tolerate them here so the global parser stops at the
+    // first unknown option instead of erroring; they are forwarded to the
+    // run phase below.
+    opts.AllowUnknownLongOptions_ = true;
 
     TOptsParseResult res(&opts, argc, argv);
     size_t freeArgsPos = res.GetFreeArgsPos();
@@ -169,8 +175,14 @@ int DoMain(int argc, char** argv, TCreateCommand create, TRunCommand run, TClean
 
     ECommandType command = (argc > 0) ? ParseCommand(*argv) : ECommandType::All;
     if (command == ECommandType::Unknown) {
-        Cerr << "Unknown command '" << *argv << "'" << Endl;
-        return EXIT_FAILURE;
+        if (argv[0][0] == '-') {
+            // First leftover token is an option, not a subcommand keyword:
+            // treat as implicit All mode and let the run phase parse it.
+            command = ECommandType::All;
+        } else {
+            Cerr << "Unknown command '" << *argv << "'" << Endl;
+            return EXIT_FAILURE;
+        }
     }
 
     if (prefix.empty()) {
@@ -227,18 +239,25 @@ int DoMain(int argc, char** argv, TCreateCommand create, TRunCommand run, TClean
             break;
         case ECommandType::All: {
             Cout << "Launching full lifecycle: create -> run -> cleanup" << Endl;
-            // Synthesize argv with a fake program name so the inner NLastGetopt
-            // parsers (ParseOptionsCreate / ParseOptionsRun) treat argv[0]
-            // as the program name and parse zero real args.
+            // Forward leftover argv to the run phase so options like
+            // --read-rps / --write-rps take effect. The leftover argv may be
+            // empty; when it is not, it starts at the first run-phase option
+            // rather than a program name, so prepend one for ParseOptionsRun.
             char programName[] = "slo";
-            char* fakeArgv[] = { programName, nullptr };
+            std::vector<char*> runArgv;
+            runArgv.reserve(argc + 1);
+            runArgv.push_back(programName);
+            for (int i = 0; i < argc; ++i) {
+                runArgv.push_back(argv[i]);
+            }
             int fakeArgc = 1;
+            char* fakeArgv[] = { programName, nullptr };
 
             Cout << "[all] Launching create command..." << Endl;
             result = create(dbOptions, fakeArgc, fakeArgv);
             if (!result) {
                 Cout << "[all] Launching run command..." << Endl;
-                result = run(dbOptions, fakeArgc, fakeArgv);
+                result = run(dbOptions, static_cast<int>(runArgv.size()), runArgv.data());
             }
             Cout << "[all] Launching cleanup command..." << Endl;
             int cleanupRc = cleanup(dbOptions, fakeArgc);

From 4d27c9f612284ccd7a5e611c70b069ea2980b29f Mon Sep 17 00:00:00 2001
From: Vladislav Polyakov <polRk@ydb.tech>
Date: Wed, 6 May 2026 00:20:53 +0300
Subject: [PATCH 08/11] ci(slo): overlay current workload harness onto baseline
 checkout
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The baseline image used to be built from the merge-base checkout's
tests/slo_workloads/, so any harness change on the PR (Dockerfile,
CLI parser, …) was absent from the baseline. SDK comparison should
only vary the library, not the harness — so reuse current's harness
on both sides.
---
 .github/workflows/slo.yml | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/.github/workflows/slo.yml b/.github/workflows/slo.yml
index 0982b7125c..63145bba8f 100644
--- a/.github/workflows/slo.yml
+++ b/.github/workflows/slo.yml
@@ -92,15 +92,19 @@ jobs:
       - name: Set up Docker Buildx
         uses: docker/setup-buildx-action@v3
 
-      # .dockerignore lives under tests/slo_workloads/; buildx expects it at
-      # the context root, so place a copy in each SDK checkout.
-      - name: Stage .dockerignore
+      # Use current's workload harness (Dockerfile, sources, .dockerignore) for
+      # both builds so only the SDK library differs between current and
+      # baseline. Without this the baseline image picks up the harness from
+      # the merge-base commit, which can lag behind the action's contract.
+      # buildx also expects .dockerignore at the context root, not under
+      # tests/, so copy it up in each checkout.
+      - name: Stage workload harness
         run: |
           set -euxo pipefail
+          rm -rf sdk-baseline/tests/slo_workloads
+          cp -a sdk-current/tests/slo_workloads sdk-baseline/tests/slo_workloads
           cp sdk-current/tests/slo_workloads/.dockerignore sdk-current/.dockerignore
-          if [ -f sdk-baseline/tests/slo_workloads/.dockerignore ]; then
-            cp sdk-baseline/tests/slo_workloads/.dockerignore sdk-baseline/.dockerignore
-          fi
+          cp sdk-baseline/tests/slo_workloads/.dockerignore sdk-baseline/.dockerignore
 
       # A clean build of the SLO image takes ~30 min because the Dockerfile
       # rebuilds the full C++ toolchain + abseil/protobuf/grpc from source.

From a6caf4ef2a74dc198b9c36051445037f89dcefbe Mon Sep 17 00:00:00 2001
From: Vladislav Polyakov <polRk@ydb.tech>
Date: Wed, 6 May 2026 00:27:27 +0300
Subject: [PATCH 09/11] ci(slo): wire ccache into the cmake build via BuildKit
 cache mount
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

cmake configure/build now run under ccache as the C/C++ compiler
launcher, with /root/.ccache exposed as a BuildKit cache mount. The
intent is for ccache state to persist across runs; note that
cache-to=type=gha,mode=max exports layer cache only, not the contents
of --mount=type=cache, so cross-run persistence is wired up separately
in PATCH 11/11. Cold runs incur the usual full compile; warm runs
reuse object hashes and should drop cmake --build from ~14 min to a
few minutes.
---
 tests/slo_workloads/Dockerfile | 19 +++++++++++++++++--
 1 file changed, 17 insertions(+), 2 deletions(-)

diff --git a/tests/slo_workloads/Dockerfile b/tests/slo_workloads/Dockerfile
index 7680560cac..f87be8f8df 100644
--- a/tests/slo_workloads/Dockerfile
+++ b/tests/slo_workloads/Dockerfile
@@ -1,7 +1,16 @@
+# syntax=docker/dockerfile:1.7
 FROM ubuntu:22.04
 
 ARG PRESET=release-test-clang
 
+# ccache settings consumed by the configure/build steps below. The cache dir
+# is materialised by the BuildKit cache mount on those RUN steps; values
+# elsewhere in the image are inert.
+ENV CCACHE_DIR=/root/.ccache
+ENV CCACHE_MAXSIZE=2G
+ENV CCACHE_COMPRESS=true
+ENV CCACHE_COMPILERCHECK=content
+
 # Every RUN that hits the network retries on transient failures so one
 # flake doesn't throw away 30 min of previous build work. apt gets five
 # Acquire retries + 60 s timeouts; wget gets the equivalent via WGET_OPTS.
@@ -147,7 +156,13 @@ COPY . /ydb-cpp-sdk
 WORKDIR /ydb-cpp-sdk
 RUN rm -rf build
 
-RUN cmake --preset ${PRESET}
-RUN cmake --build --preset default --target slo-key-value
+RUN --mount=type=cache,target=/root/.ccache,sharing=locked \
+    cmake --preset ${PRESET} \
+        -DCMAKE_C_COMPILER_LAUNCHER=ccache \
+        -DCMAKE_CXX_COMPILER_LAUNCHER=ccache
+RUN --mount=type=cache,target=/root/.ccache,sharing=locked \
+    ccache --zero-stats >/dev/null \
+    && cmake --build --preset default --target slo-key-value \
+    && ccache --show-stats
 
 ENTRYPOINT ["./build/tests/slo_workloads/key_value/slo-key-value"]

From 09e662f97d5c2dd8c8a0b5b9e4c78fe745c2141e Mon Sep 17 00:00:00 2001
From: Vladislav Polyakov <polRk@ydb.tech>
Date: Wed, 6 May 2026 03:17:03 +0300
Subject: [PATCH 10/11] tests/slo_workloads: read prefix from YDB_DATABASE when
 connection-string lacks ?database=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The v2 SLO action provides YDB_CONNECTION_STRING in path form
(grpc://host:port/Root/testdb), which GetDatabase can't parse, so
prefix stayed empty and create issued CreateTable("key_value")
without the database root → BAD_REQUEST. Falling back to the
YDB_DATABASE env var (which the action also sets) restores the
correct table path.
---
 tests/slo_workloads/utils/utils.cpp | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/tests/slo_workloads/utils/utils.cpp b/tests/slo_workloads/utils/utils.cpp
index 89a642b61d..f52461283b 100644
--- a/tests/slo_workloads/utils/utils.cpp
+++ b/tests/slo_workloads/utils/utils.cpp
@@ -188,6 +188,12 @@ int DoMain(int argc, char** argv, TCreateCommand create, TRunCommand run, TClean
     if (prefix.empty()) {
         prefix = GetDatabase(connectionString);
     }
+    if (prefix.empty()) {
+        // YDB SLO action sets YDB_CONNECTION_STRING in path form
+        // (grpc://host:port/Root/testdb), which GetDatabase can't parse.
+        // Fall back to YDB_DATABASE which the action sets alongside it.
+        prefix = GetEnv("YDB_DATABASE");
+    }
 
     if (!ParseToken(token, tokenFile)) {
         return EXIT_FAILURE;

From a581162623f4982bcf4e81a7321328d5146431f6 Mon Sep 17 00:00:00 2001
From: Vladislav Polyakov <polRk@ydb.tech>
Date: Wed, 6 May 2026 03:23:57 +0300
Subject: [PATCH 11/11] ci(slo): persist ccache between runs via actions/cache
 + buildkit-cache-dance

cache-to: type=gha,mode=max exports layer cache but not the contents
of --mount=type=cache, so the ccache mount started empty on every
run (0% hit rate observed). Restore /root/.ccache from a host dir
backed by actions/cache, inject it into BuildKit before each build
via the cache-dance action, and extract the updated state for the
next run. Same scope is shared by current and baseline since the
SDK code overlap dominates ccache hit potential.
---
 .github/workflows/slo.yml | 27 +++++++++++++++++++++++++++
 1 file changed, 27 insertions(+)

diff --git a/.github/workflows/slo.yml b/.github/workflows/slo.yml
index 63145bba8f..db9e4d9841 100644
--- a/.github/workflows/slo.yml
+++ b/.github/workflows/slo.yml
@@ -106,6 +106,33 @@ jobs:
           cp sdk-current/tests/slo_workloads/.dockerignore sdk-current/.dockerignore
           cp sdk-baseline/tests/slo_workloads/.dockerignore sdk-baseline/.dockerignore
 
+      # `cache-to: type=gha` does NOT export `--mount=type=cache` content, so
+      # ccache state is lost between runs. Persist /root/.ccache via host
+      # directory + cache-dance: actions/cache restores the host dir, the
+      # dance injects it into the BuildKit cache mount before the build and
+      # extracts the updated state afterwards for the next save.
+      - name: Restore ccache
+        id: ccache
+        uses: actions/cache@v4
+        with:
+          path: ccache
+          key: slo-ccache-${{ matrix.sdk.preset }}-${{ github.run_id }}
+          restore-keys: |
+            slo-ccache-${{ matrix.sdk.preset }}-
+
+      - name: Inject ccache into BuildKit
+        uses: reproducible-containers/buildkit-cache-dance@v3.1.2
+        with:
+          cache-map: |
+            {
+              "ccache": "/root/.ccache"
+            }
+          # Always extract so newly-compiled TUs from this run are saved by
+          # actions/cache (key uses ${{ github.run_id }}, so each run gets
+          # its own snapshot). Without extraction the cache stays frozen at
+          # whatever was first persisted.
+          skip-extraction: false
+
       # A clean build of the SLO image takes ~30 min because the Dockerfile
       # rebuilds the full C++ toolchain + abseil/protobuf/grpc from source.
       # The GHA cache lets subsequent runs reuse every layer up to the SDK