From a3396279d907d2380e3bb4707caa7abb676299b0 Mon Sep 17 00:00:00 2001 From: tewbo Date: Mon, 11 May 2026 07:24:48 +0300 Subject: [PATCH 01/30] add 6 metrics --- .../opentelemetry/otel_metrics_example.py | 91 ++++++++ ydb/aio/query/pool.py | 27 +++ ydb/aio/query/session.py | 8 + ydb/opentelemetry/__init__.py | 24 +- ydb/opentelemetry/_plugin.py | 190 +++++++++++++++ ydb/opentelemetry/metrics.py | 220 ++++++++++++++++++ ydb/opentelemetry/tracing.py | 7 +- ydb/query/pool.py | 30 ++- ydb/query/session.py | 20 +- 9 files changed, 608 insertions(+), 9 deletions(-) create mode 100644 examples/opentelemetry/otel_metrics_example.py create mode 100644 ydb/opentelemetry/_plugin.py create mode 100644 ydb/opentelemetry/metrics.py diff --git a/examples/opentelemetry/otel_metrics_example.py b/examples/opentelemetry/otel_metrics_example.py new file mode 100644 index 000000000..fb893b29e --- /dev/null +++ b/examples/opentelemetry/otel_metrics_example.py @@ -0,0 +1,91 @@ +"""OpenTelemetry metrics demo for YDB client-side metrics. + +The example exports SDK metrics to the OpenTelemetry Collector via OTLP. The +collector exposes them for Prometheus, which is configured in compose-e2e.yaml. 
+""" + +from __future__ import annotations + +import asyncio +import os +import signal +from types import FrameType +from typing import Callable, Optional + +import ydb +from opentelemetry.exporter.otlp.proto.grpc.metric_exporter import OTLPMetricExporter +from opentelemetry.sdk.metrics import MeterProvider +from opentelemetry.sdk.metrics.export import PeriodicExportingMetricReader +from opentelemetry.sdk.resources import Resource +from ydb.opentelemetry import enable_registry + + +def _env(name: str, default: str) -> str: + value = os.environ.get(name) + return value if value else default + + +def _create_stop_event() -> asyncio.Event: + stop = asyncio.Event() + loop = asyncio.get_running_loop() + request_stop: Callable[[], None] = stop.set + handle_stop_signal: Callable[[int, Optional[FrameType]], None] = lambda signum, frame: stop.set() + + for sig in (signal.SIGINT, signal.SIGTERM): + try: + loop.add_signal_handler(sig, request_stop) + except NotImplementedError: + signal.signal(sig, handle_stop_signal) + + return stop + + +async def _run_workload(pool: ydb.aio.QuerySessionPool, stop: asyncio.Event) -> None: + counter = 0 + while not stop.is_set(): + counter += 1 + result_sets = await asyncio.gather( + *( + pool.execute_with_retries( + "SELECT $session_id AS session_id, $iteration AS iteration", + parameters={ + "$session_id": (i, ydb.PrimitiveType.Uint64), + "$iteration": (counter, ydb.PrimitiveType.Uint64), + }, + ) + for i in range(4) + ) + ) + session_ids = [int(list(result[0].rows)[0]["session_id"]) for result in result_sets] + print(f"completed concurrent queries: {session_ids}") + await asyncio.sleep(2) + + +async def main() -> None: + endpoint = _env("YDB_ENDPOINT", "grpc://localhost:2136") + database = _env("YDB_DATABASE", "/local") + otlp_endpoint = _env("OTEL_EXPORTER_OTLP_ENDPOINT", "http://localhost:4317") + + resource = Resource(attributes={"service.name": _env("OTEL_SERVICE_NAME", "ydb-client-metrics-example")}) + metric_reader = 
PeriodicExportingMetricReader( + OTLPMetricExporter(endpoint=otlp_endpoint), + export_interval_millis=1000, + ) + meter_provider = MeterProvider(resource=resource, metric_readers=[metric_reader]) + enable_registry(meter_provider) + + stop = _create_stop_event() + + try: + async with ydb.aio.Driver(endpoint=endpoint, database=database, disable_discovery=True) as driver: + await driver.wait(timeout=60) + + async with ydb.aio.QuerySessionPool(driver, size=4) as pool: + print("YDB client metrics are being exported. Open Prometheus and query ydb_query_session_count.") + await _run_workload(pool, stop) + finally: + meter_provider.shutdown() + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/ydb/aio/query/pool.py b/ydb/aio/query/pool.py index a0d9d93c9..21ce110fc 100644 --- a/ydb/aio/query/pool.py +++ b/ydb/aio/query/pool.py @@ -2,6 +2,8 @@ import asyncio import logging +import itertools +import time from typing import ( Callable, Optional, @@ -22,11 +24,19 @@ from ...query.base import QueryClientSettings from ... import convert from ... import issues +from ...opentelemetry.metrics import ( + record_query_session_count, + record_query_session_create_time, + record_query_session_pending_requests, + record_query_session_timeout, +) from ..._grpc.grpcwrapper import common_utils from ..._grpc.grpcwrapper import ydb_query_public_types as _ydb_query_public logger = logging.getLogger(__name__) +_pool_name_counter = itertools.count(1) + class QuerySessionPool: """QuerySessionPool is an object to simplify operations with sessions of Query Service.""" @@ -38,11 +48,13 @@ def __init__( *, query_client_settings: Optional[QueryClientSettings] = None, loop: Optional[asyncio.AbstractEventLoop] = None, + name: Optional[str] = None, ): """ :param driver: A driver instance :param size: Size of session pool :param query_client_settings: ydb.QueryClientSettings object to configure QueryService behavior + :param name: Optional session pool name for OpenTelemetry metrics. 
""" self._driver = driver @@ -52,10 +64,15 @@ def __init__( self._current_size = 0 self._loop = asyncio.get_running_loop() if loop is None else loop self._query_client_settings = query_client_settings + self._metrics_pool_name = name or "query-session-pool-%d" % next(_pool_name_counter) async def _create_new_session(self): session = QuerySession(self._driver, settings=self._query_client_settings) + session._metrics_pool_name = self._metrics_pool_name + session._metrics_state = "used" + start_time = time.monotonic() await session.create() + record_query_session_create_time(time.monotonic() - start_time, self._metrics_pool_name) logger.debug(f"New session was created for pool. Session id: {session.session_id}") return session @@ -81,6 +98,7 @@ async def acquire(self, timeout: Optional[float] = None) -> QuerySession: pass if session is None and self._current_size == self._size: + record_query_session_pending_requests(1, self._metrics_pool_name) queue_get = asyncio.ensure_future(self._queue.get()) task_stop = asyncio.ensure_future(self._should_stop.wait()) task_timeout = ( @@ -97,6 +115,8 @@ async def acquire(self, timeout: Optional[float] = None) -> QuerySession: if not cancelled and not queue_get.exception(): await self.release(queue_get.result()) raise + finally: + record_query_session_pending_requests(-1, self._metrics_pool_name) task_stop.cancel() if task_timeout is not None: @@ -110,12 +130,16 @@ async def acquire(self, timeout: Optional[float] = None) -> QuerySession: cancelled = queue_get.cancel() if not cancelled and not queue_get.exception(): await self.release(queue_get.result()) + record_query_session_timeout(self._metrics_pool_name) raise issues.SessionPoolEmpty("Timeout on acquire session") session = queue_get.result() if session is not None: if session.is_active: + record_query_session_count(-1, self._metrics_pool_name, "idle") + session._metrics_state = "used" + record_query_session_count(1, self._metrics_pool_name, "used") logger.debug(f"Acquired 
active session from queue: {session.session_id}") return session else: @@ -137,6 +161,9 @@ async def acquire(self, timeout: Optional[float] = None) -> QuerySession: async def release(self, session: QuerySession) -> None: """Release a session back to Session Pool.""" + record_query_session_count(-1, self._metrics_pool_name, "used") + session._metrics_state = "idle" + record_query_session_count(1, self._metrics_pool_name, "idle") self._queue.put_nowait(session) logger.debug("Session returned to queue: %s", session.session_id) diff --git a/ydb/aio/query/session.py b/ydb/aio/query/session.py index b776b6382..081ad2baa 100644 --- a/ydb/aio/query/session.py +++ b/ydb/aio/query/session.py @@ -13,6 +13,7 @@ from .transaction import QueryTxContext from .. import _utilities from ... import issues +from ...opentelemetry.metrics import record_query_session_count from ...settings import BaseRequestSettings from ..._grpc.grpcwrapper import common_utils from ..._grpc.grpcwrapper import ydb_query_public_types as _ydb_query_public @@ -110,6 +111,13 @@ async def create(self, settings: Optional[BaseRequestSettings] = None) -> "Query await self._create_call(settings=settings) set_peer_attributes(span, self._peer) await self._attach() + if not getattr(self, "_metrics_counted", False): + record_query_session_count( + 1, + pool_name=getattr(self, "_metrics_pool_name", None), + state=getattr(self, "_metrics_state", "used"), + ) + self._metrics_counted = True return self diff --git a/ydb/opentelemetry/__init__.py b/ydb/opentelemetry/__init__.py index fc058d0d8..a6d379d1d 100644 --- a/ydb/opentelemetry/__init__.py +++ b/ydb/opentelemetry/__init__.py @@ -13,7 +13,7 @@ def enable_tracing(tracer=None): ``ydb.sdk`` from the global tracer provider will be used. """ try: - from ydb.opentelemetry.plugin import _enable_tracing + from ydb.opentelemetry._plugin import _enable_tracing except ImportError: raise ImportError( "OpenTelemetry packages are required for tracing support. 
" @@ -26,11 +26,29 @@ def enable_tracing(tracer=None): def disable_tracing(): """Disable YDB OpenTelemetry hooks and allow :func:`enable_tracing` to run again.""" try: - from ydb.opentelemetry.plugin import _disable_tracing + from ydb.opentelemetry._plugin import _disable_tracing except ImportError: return _disable_tracing() -__all__ = ["disable_tracing", "enable_tracing"] +def enable_registry(meter_provider=None): + """Enable OpenTelemetry metrics collection for YDB SDK client metrics. + + Args: + meter_provider: Optional OpenTelemetry MeterProvider. If not provided, + the global OpenTelemetry meter provider is used. + """ + try: + from ydb.opentelemetry._plugin import _enable_metrics + except ImportError: + raise ImportError( + "OpenTelemetry packages are required for metrics support. " + "Install them with: pip install ydb[opentelemetry]" + ) from None + + _enable_metrics(meter_provider) + + +__all__ = ["disable_tracing", "enable_registry", "enable_tracing"] diff --git a/ydb/opentelemetry/_plugin.py b/ydb/opentelemetry/_plugin.py new file mode 100644 index 000000000..c7485f5f2 --- /dev/null +++ b/ydb/opentelemetry/_plugin.py @@ -0,0 +1,190 @@ +"""OpenTelemetry bridge for YDB.""" + +from opentelemetry import context as otel_context +from opentelemetry import metrics +from opentelemetry import trace +from opentelemetry.propagate import inject +from opentelemetry.trace import StatusCode +from opentelemetry.metrics import Observation + +from ydb import issues +from ydb.issues import StatusCode as YdbStatusCode +from ydb.opentelemetry.metrics import _metrics_registry, create_metrics_operation +from ydb.opentelemetry.tracing import _registry as _tracing_registry + +# YDB client transport StatusCode values (401xxx band) -> OTel error.type transport_error. 
+_TRANSPORT_STATUSES = frozenset( + { + YdbStatusCode.CONNECTION_LOST, + YdbStatusCode.CONNECTION_FAILURE, + YdbStatusCode.DEADLINE_EXCEEDED, + YdbStatusCode.CLIENT_INTERNAL_ERROR, + YdbStatusCode.UNIMPLEMENTED, + } +) + +_tracer = None +_meter = None +_enabled = False + +_KIND_MAP = { + "client": trace.SpanKind.CLIENT, + "internal": trace.SpanKind.INTERNAL, +} + + +def _otel_metadata_hook(): + """Inject W3C Trace Context into outgoing gRPC metadata using the active OTel context.""" + headers = {} + inject(headers) + return list(headers.items()) + + +def _set_error_on_span(span, exception): + if isinstance(exception, issues.Error) and exception.status is not None: + span.set_attribute("db.response.status_code", exception.status.name) + error_type = "transport_error" if exception.status in _TRANSPORT_STATUSES else "ydb_error" + else: + error_type = type(exception).__qualname__ + + span.set_attribute("error.type", error_type) + span.set_status(StatusCode.ERROR, str(exception)) + span.record_exception(exception) + + +class _AttachContext: + """Make a span the active OTel context for a ``with`` block, without ending it. + + Used around the initial gRPC call of a streaming RPC: the span outlives the + ``with`` block — the result iterator owns ``end()``. For non-streaming RPCs + use ``with create_ydb_span(...)`` directly. + """ + + def __init__(self, raw_span): + self._raw = raw_span + self._token = None + + def __enter__(self): + ctx = trace.set_span_in_context(self._raw) + self._token = otel_context.attach(ctx) + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + if self._token is not None: + otel_context.detach(self._token) + self._token = None + return False + + +class TracingSpan: + """Wrapper around an OTel span. + + As context manager: ``__enter__`` attaches the OTel context (so child spans + nest correctly and ``inject()`` sees this span when building gRPC metadata) + and ``__exit__`` detaches and ends the span. 
Used by Commit / Rollback / + RunWithRetry / Try and similar single-shot operations. + + For ExecuteQuery streams the span outlives the ``with`` block: call + :meth:`attach_context` around the initial gRPC call only, and let the result + iterator own ``end()``. + """ + + def __init__(self, span, name, attributes): + self._span = span + self._otel_context_token = None + self._metrics_operation = create_metrics_operation(name, attributes) + + def set_error(self, exception): + _set_error_on_span(self._span, exception) + self._metrics_operation.set_error(exception) + + def set_attribute(self, key, value): + self._span.set_attribute(key, value) + self._metrics_operation.set_attribute(key, value) + + def end(self): + self._span.end() + self._metrics_operation.end() + + def attach_context(self): + return _AttachContext(self._span) + + def __enter__(self): + ctx = trace.set_span_in_context(self._span) + self._otel_context_token = otel_context.attach(ctx) + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + if self._otel_context_token is not None: + otel_context.detach(self._otel_context_token) + self._otel_context_token = None + if exc_val is not None: + self.set_error(exc_val) + self.end() + return False + + +def _create_span(name, attributes=None, kind=None): + attrs = attributes or {} + span = _tracer.start_span( + name, + kind=_KIND_MAP.get(kind, trace.SpanKind.CLIENT), + attributes=attrs, + ) + return TracingSpan(span, name, attrs) + + +def _enable_tracing(tracer=None): + global _enabled, _tracer + + if _enabled: + return + + _tracer = tracer if tracer is not None else trace.get_tracer("ydb.sdk") + _enabled = True + _tracing_registry.set_metadata_hook(_otel_metadata_hook) + _tracing_registry.set_create_span(_create_span) + + +def _disable_tracing(): + """Clear hooks and tracer; after this, :func:`~ydb.opentelemetry.enable_tracing` may be called again.""" + global _enabled, _tracer + + _tracing_registry.set_create_span(None) + 
_tracing_registry.set_metadata_hook(None) + _enabled = False + _tracer = None + + +def _create_query_session_count_callback(): + """Create callback for observable query session count metric.""" + + def observe_query_session_count(_): + values = _metrics_registry.get_query_session_count_values() + return [Observation(value, attributes=dict(attrs)) for attrs, value in values.items()] + + return observe_query_session_count + + +def _enable_metrics(meter_provider): + global _meter + + if _meter is not None: + return + + if meter_provider is None: + _meter = metrics.get_meter("ydb.sdk") + elif hasattr(meter_provider, "get_meter"): + _meter = meter_provider.get_meter("ydb.sdk") + else: + raise TypeError("meter_provider must be an OpenTelemetry MeterProvider") + + _metrics_registry.set_meter(_meter, _create_query_session_count_callback()) + + +def _disable_metrics(): + global _meter + + _metrics_registry.clear() + if _meter is not None: + _meter = None diff --git a/ydb/opentelemetry/metrics.py b/ydb/opentelemetry/metrics.py new file mode 100644 index 000000000..e4131a558 --- /dev/null +++ b/ydb/opentelemetry/metrics.py @@ -0,0 +1,220 @@ +"""No-op-safe helpers for YDB OpenTelemetry metrics.""" + +import time +from typing import Any, Dict, Optional + +CLIENT_OPERATION_DURATION = "db.client.operation.duration" +CLIENT_OPERATION_FAILED = "ydb.client.operation.failed" +QUERY_SESSION_COUNT = "ydb.query.session.count" +QUERY_SESSION_CREATE_TIME = "ydb.query.session.create_time" +QUERY_SESSION_PENDING_REQUESTS = "ydb.query.session.pending_requests" +QUERY_SESSION_TIMEOUTS = "ydb.query.session.timeouts" + +_UNKNOWN_POOL = "unknown" + + +import threading + + +class MetricsRegistry: + def __init__(self) -> None: + self._instruments: Dict[str, Any] = {} + self._query_session_count_values: Dict[Any, int] = {} + self._query_session_count_lock = threading.Lock() + + def set_meter(self, meter: Any, observe_query_session_count_callback: Any) -> None: + self._instruments = { + 
CLIENT_OPERATION_DURATION: meter.create_histogram( + CLIENT_OPERATION_DURATION, + unit="s", + description="Duration of YDB client operations.", + ), + CLIENT_OPERATION_FAILED: meter.create_counter( + CLIENT_OPERATION_FAILED, + unit="{command}", + description="Number of failed YDB client operations.", + ), + QUERY_SESSION_COUNT: meter.create_observable_up_down_counter( + QUERY_SESSION_COUNT, + callbacks=[observe_query_session_count_callback], + unit="{connection}", + description="Number of open YDB query sessions.", + ), + QUERY_SESSION_CREATE_TIME: meter.create_histogram( + QUERY_SESSION_CREATE_TIME, + unit="s", + description="Duration of YDB query session creation.", + ), + QUERY_SESSION_PENDING_REQUESTS: meter.create_up_down_counter( + QUERY_SESSION_PENDING_REQUESTS, + unit="{request}", + description="Number of requests waiting for a YDB query session.", + ), + QUERY_SESSION_TIMEOUTS: meter.create_counter( + QUERY_SESSION_TIMEOUTS, + unit="{connection}", + description="Number of YDB query session acquisition timeouts.", + ), + } + + def clear(self) -> None: + self._instruments = {} + with self._query_session_count_lock: + self._query_session_count_values = {} + + def add(self, name: str, value: int, attributes: Optional[Dict[str, Any]] = None) -> None: + """ + Record a metric value, accumulating for observable metrics or adding directly for others. + + For observable metrics, values are accumulated by attributes and sent via callback. + For regular metrics, values are added immediately to the instrument. + + Args: + name: Name of the metric. + value: Value to add (positive or negative). + attributes: Optional dictionary of metric attributes (labels). + """ + instrument = self._instruments.get(name) + if instrument is not None: + instrument.add(value, attributes=attributes or {}) + + def record(self, name: str, value: float, attributes: Optional[Dict[str, Any]] = None) -> None: + """ + Record a histogram or gauge metric value. + + Args: + name: Name of the metric. 
+ value: Value to record. + attributes: Optional dictionary of metric attributes (labels). + """ + instrument = self._instruments.get(name) + if instrument is not None: + instrument.record(value, attributes=attributes or {}) + + def add_query_session_count(self, value: int, attributes: Optional[Dict[str, Any]] = None) -> None: + attrs = tuple(sorted((attributes or {}).items())) + + with self._query_session_count_lock: + new_value = self._query_session_count_values.get(attrs, 0) + value + + self._query_session_count_values.pop(attrs, None) + self._query_session_count_values[attrs] = new_value + + def get_query_session_count_values(self) -> Dict[Any, int]: + with self._query_session_count_lock: + return dict(self._query_session_count_values) + + +_metrics_registry = MetricsRegistry() + + +def _pool_attrs(pool_name: Optional[str]) -> Dict[str, Any]: + return {"ydb.query.session.pool.name": pool_name or _UNKNOWN_POOL} + + +def _operation_attrs(operation_name: str, attributes: Dict[str, Any]) -> Dict[str, Any]: + return { + "db.system.name": attributes.get("db.system.name", "ydb"), + "db.namespace": attributes.get("db.namespace", ""), + "server.address": attributes.get("server.address", ""), + "server.port": attributes.get("server.port", 0), + "ydb.operation.name": operation_name, + } + + +def _response_status_code(exception: BaseException) -> str: + status = getattr(exception, "status", None) + if status is not None: + return getattr(status, "name", str(status)) + return type(exception).__qualname__ + + +class MetricsOperation: + """ + Context manager for tracking metrics of a single YDB operation. + + Records operation duration and captures errors. When the operation ends, + metrics are recorded to the registry with operation attributes. + + Attributes: + _name: Name of the operation. + _attributes: Dictionary of attributes attached to all metrics from this operation. + _start_time: Timestamp when the operation started (using monotonic). 
+ _exception: Optional exception that occurred during operation execution. + _ended: Flag to ensure metrics are recorded only once. + """ + + def __init__(self, name: str, attributes: Optional[Dict[str, Any]] = None) -> None: + """ + Initialize a metrics operation. + + Args: + name: Name of the operation (e.g., 'ExecuteQuery', 'CreateSession'). + attributes: Optional dictionary of initial attributes for the operation. + """ + self._name = name + self._attributes = _operation_attrs(name, attributes or {}) + self._start_time = time.monotonic() + self._exception: Optional[BaseException] = None + self._ended = False + + def set_error(self, exception: BaseException) -> None: + """ + Record an exception that occurred during the operation. + + Args: + exception: The exception to record. + """ + self._exception = exception + + def set_attribute(self, key: str, value: Any) -> None: + self._attributes[key] = value + + def attach_context(self, end_on_exit=True) -> "MetricsOperation": + return self + + def end(self) -> None: + # todo: consider multi-thread calling + + if self._ended: + return + self._ended = True + + duration = time.monotonic() - self._start_time + _metrics_registry.record(CLIENT_OPERATION_DURATION, duration, self._attributes) + + if self._exception is not None: + attrs = dict(self._attributes) + attrs["db.response.status_code"] = _response_status_code(self._exception) + _metrics_registry.add(CLIENT_OPERATION_FAILED, 1, attrs) + + def __enter__(self) -> "MetricsOperation": + return self + + def __exit__(self, exc_type, exc_val, exc_tb) -> bool: + if exc_val is not None: + self.set_error(exc_val) + self.end() + return False + + +def create_metrics_operation(name: str, attributes: Optional[Dict[str, Any]] = None) -> MetricsOperation: + return MetricsOperation(name, attributes) + + +def record_query_session_count(delta: int, pool_name: Optional[str] = None, state: str = "used") -> None: + attrs = _pool_attrs(pool_name) + attrs["ydb.query.session.state"] = state + 
_metrics_registry.add_query_session_count(delta, attrs) + + +def record_query_session_create_time(duration: float, pool_name: Optional[str]) -> None: + _metrics_registry.record(QUERY_SESSION_CREATE_TIME, duration, _pool_attrs(pool_name)) + + +def record_query_session_pending_requests(delta: int, pool_name: Optional[str]) -> None: + _metrics_registry.add(QUERY_SESSION_PENDING_REQUESTS, delta, _pool_attrs(pool_name)) + + +def record_query_session_timeout(pool_name: Optional[str]) -> None: + _metrics_registry.add(QUERY_SESSION_TIMEOUTS, 1, _pool_attrs(pool_name)) diff --git a/ydb/opentelemetry/tracing.py b/ydb/opentelemetry/tracing.py index 1d0995df8..f78d7e2b1 100644 --- a/ydb/opentelemetry/tracing.py +++ b/ydb/opentelemetry/tracing.py @@ -3,6 +3,8 @@ import enum from typing import Optional, Tuple +from ydb.opentelemetry.metrics import create_metrics_operation + class SpanName(str, enum.Enum): """Canonical span names used across the YDB SDK.""" @@ -16,7 +18,6 @@ class SpanName(str, enum.Enum): RUN_WITH_RETRY = "ydb.RunWithRetry" TRY = "ydb.Try" - class _NoopCtx: __slots__ = ("_span",) @@ -135,9 +136,9 @@ def create_span(name, attributes=None, kind="internal"): def create_ydb_span(name, driver_config, node_id=None, kind=None, peer=None): """Create a span pre-filled with standard YDB attributes.""" - if not _registry.is_active(): - return _NOOP_SPAN attrs = _build_ydb_attrs(driver_config, node_id, peer) + if not _registry.is_active(): + return create_metrics_operation(name, attrs) return _registry.create_span(name, attributes=attrs, kind=kind) diff --git a/ydb/query/pool.py b/ydb/query/pool.py index 44d4d34af..e17a451b7 100644 --- a/ydb/query/pool.py +++ b/ydb/query/pool.py @@ -14,6 +14,7 @@ import time import threading import queue +import itertools from .base import BaseQueryTxMode, QueryExplainResultFormat from .base import QueryClientSettings @@ -27,6 +28,12 @@ from .. import issues from .. 
import convert from ..settings import BaseRequestSettings +from ..opentelemetry.metrics import ( + record_query_session_count, + record_query_session_create_time, + record_query_session_pending_requests, + record_query_session_timeout, +) from .._grpc.grpcwrapper import ydb_query_public_types as _ydb_query_public if TYPE_CHECKING: @@ -34,6 +41,8 @@ logger = logging.getLogger(__name__) +_pool_name_counter = itertools.count(1) + class QuerySessionPool: """QuerySessionPool is an object to simplify operations with sessions of Query Service.""" @@ -47,12 +56,14 @@ def __init__( *, query_client_settings: Optional[QueryClientSettings] = None, workers_threads_count: int = 4, + name: Optional[str] = None, ): """ :param driver: A driver instance. :param size: Max size of Session Pool. :param query_client_settings: ydb.QueryClientSettings object to configure QueryService behavior :param workers_threads_count: A number of threads in executor used for ``*_async`` methods + :param name: Optional session pool name for OpenTelemetry metrics. """ self._driver = driver @@ -63,10 +74,15 @@ def __init__( self._should_stop = threading.Event() self._lock = threading.RLock() self._query_client_settings = query_client_settings + self._metrics_pool_name = name or "query-session-pool-%d" % next(_pool_name_counter) def _create_new_session(self, timeout: Optional[float]): session = QuerySession(self._driver, settings=self._query_client_settings) + session._metrics_pool_name = self._metrics_pool_name + session._metrics_state = "used" + start_time = time.monotonic() session.create(settings=BaseRequestSettings().with_timeout(timeout)) + record_query_session_create_time(time.monotonic() - start_time, self._metrics_pool_name) logger.debug(f"New session was created for pool. 
Session id: {session.session_id}") return session @@ -95,17 +111,24 @@ def acquire(self, timeout: Optional[float] = None) -> QuerySession: pass finish = time.monotonic() - timeout = timeout - (finish - start) if timeout is not None else None + timeout = max(0, timeout - (finish - start)) if timeout is not None else None start = time.monotonic() if session is None and self._current_size == self._size: + record_query_session_pending_requests(1, self._metrics_pool_name) try: session = self._queue.get(block=True, timeout=timeout) except queue.Empty: + record_query_session_timeout(self._metrics_pool_name) raise issues.SessionPoolEmpty("Timeout on acquire session") + finally: + record_query_session_pending_requests(-1, self._metrics_pool_name) if session is not None: if session.is_active: + record_query_session_count(-1, self._metrics_pool_name, "idle") + session._metrics_state = "used" + record_query_session_count(1, self._metrics_pool_name, "used") logger.debug(f"Acquired active session from queue: {session.session_id}") return session else: @@ -114,7 +137,7 @@ def acquire(self, timeout: Optional[float] = None) -> QuerySession: logger.debug(f"Session pool is not large enough: {self._current_size} < {self._size}, will create new one.") finish = time.monotonic() - time_left = timeout - (finish - start) if timeout is not None else None + time_left = max(0, timeout - (finish - start)) if timeout is not None else None session = self._create_new_session(time_left) self._current_size += 1 @@ -125,6 +148,9 @@ def acquire(self, timeout: Optional[float] = None) -> QuerySession: def release(self, session: QuerySession) -> None: """Release a session back to Session Pool.""" + record_query_session_count(-1, self._metrics_pool_name, "used") + session._metrics_state = "idle" + record_query_session_count(1, self._metrics_pool_name, "idle") self._queue.put_nowait(session) logger.debug("Session returned to queue: %s", session.session_id) diff --git a/ydb/query/session.py 
b/ydb/query/session.py index a9c1b4a50..28dc80c06 100644 --- a/ydb/query/session.py +++ b/ydb/query/session.py @@ -19,6 +19,8 @@ from .. import _apis, issues, _utilities from ..opentelemetry.tracing import SpanName, create_ydb_span, set_peer_attributes, span_finish_callback +from ..opentelemetry.metrics import record_query_session_count +from ..opentelemetry.tracing import create_ydb_span, set_peer_attributes from ..settings import BaseRequestSettings from ..connection import _RpcState as RpcState, EndpointKey from .._grpc.grpcwrapper import common_utils @@ -94,6 +96,7 @@ class BaseQuerySession(abc.ABC, Generic[DriverT]): _peer: Optional[tuple] = None _closed: bool = False _invalidated: bool = False + _metrics_counted: bool = False def __init__(self, driver: DriverT, settings: Optional[base.QueryClientSettings] = None): self._driver = driver @@ -106,6 +109,7 @@ def __init__(self, driver: DriverT, settings: Optional[base.QueryClientSettings] ) self._last_query_stats = None + self._metrics_counted = False @property def _driver_config(self) -> Optional["DriverConfig"]: @@ -159,6 +163,13 @@ def _check_session_ready_to_use(self) -> None: def _close_session(self, invalidate: bool = False) -> None: if self._closed: return + if self._metrics_counted: + record_query_session_count( + -1, + pool_name=getattr(self, "_metrics_pool_name", None), + state=getattr(self, "_metrics_state", "used"), + ) + self._metrics_counted = False if invalidate: self._invalidated = True self._closed = True @@ -418,10 +429,17 @@ def create(self, settings: Optional[BaseRequestSettings] = None) -> "QuerySessio if self._closed: raise RuntimeError("Session is already closed.") - with create_ydb_span(SpanName.CREATE_SESSION, self._driver_config).attach_context() as span: + with create_ydb_span("ydb.CreateSession", self._driver_config).attach_context() as span: self._create_call(settings=settings) set_peer_attributes(span, self._peer) self._attach() + if not getattr(self, "_metrics_counted", False): + 
record_query_session_count( + 1, + pool_name=getattr(self, "_metrics_pool_name", None), + state=getattr(self, "_metrics_state", "used"), + ) + self._metrics_counted = True return self From 6d2b3fc370f769383e0cbfdf5ec3c46190bfbdb9 Mon Sep 17 00:00:00 2001 From: tewbo Date: Wed, 13 May 2026 01:11:45 +0300 Subject: [PATCH 02/30] refactoring --- ydb/opentelemetry/__init__.py | 18 +++- ydb/opentelemetry/_plugin.py | 190 ---------------------------------- ydb/opentelemetry/plugin.py | 61 +++++++++-- 3 files changed, 64 insertions(+), 205 deletions(-) delete mode 100644 ydb/opentelemetry/_plugin.py diff --git a/ydb/opentelemetry/__init__.py b/ydb/opentelemetry/__init__.py index a6d379d1d..e29a577d6 100644 --- a/ydb/opentelemetry/__init__.py +++ b/ydb/opentelemetry/__init__.py @@ -13,7 +13,7 @@ def enable_tracing(tracer=None): ``ydb.sdk`` from the global tracer provider will be used. """ try: - from ydb.opentelemetry._plugin import _enable_tracing + from ydb.opentelemetry.plugin import _enable_tracing except ImportError: raise ImportError( "OpenTelemetry packages are required for tracing support. " @@ -26,7 +26,7 @@ def enable_tracing(tracer=None): def disable_tracing(): """Disable YDB OpenTelemetry hooks and allow :func:`enable_tracing` to run again.""" try: - from ydb.opentelemetry._plugin import _disable_tracing + from ydb.opentelemetry.plugin import _disable_tracing except ImportError: return @@ -41,7 +41,7 @@ def enable_registry(meter_provider=None): the global OpenTelemetry meter provider is used. """ try: - from ydb.opentelemetry._plugin import _enable_metrics + from ydb.opentelemetry.plugin import _enable_metrics except ImportError: raise ImportError( "OpenTelemetry packages are required for metrics support. 
" @@ -51,4 +51,14 @@ def enable_registry(meter_provider=None): _enable_metrics(meter_provider) -__all__ = ["disable_tracing", "enable_registry", "enable_tracing"] +def disable_registry(): + """Disable YDB OpenTelemetry metrics collection and allow :func:`enable_registry` to run again.""" + try: + from ydb.opentelemetry.plugin import _disable_metrics + except ImportError: + return + + _disable_metrics() + + +__all__ = ["disable_tracing", "enable_tracing", "disable_registry", "enable_registry"] diff --git a/ydb/opentelemetry/_plugin.py b/ydb/opentelemetry/_plugin.py deleted file mode 100644 index c7485f5f2..000000000 --- a/ydb/opentelemetry/_plugin.py +++ /dev/null @@ -1,190 +0,0 @@ -"""OpenTelemetry bridge for YDB.""" - -from opentelemetry import context as otel_context -from opentelemetry import metrics -from opentelemetry import trace -from opentelemetry.propagate import inject -from opentelemetry.trace import StatusCode -from opentelemetry.metrics import Observation - -from ydb import issues -from ydb.issues import StatusCode as YdbStatusCode -from ydb.opentelemetry.metrics import _metrics_registry, create_metrics_operation -from ydb.opentelemetry.tracing import _registry as _tracing_registry - -# YDB client transport StatusCode values (401xxx band) -> OTel error.type transport_error. 
-_TRANSPORT_STATUSES = frozenset( - { - YdbStatusCode.CONNECTION_LOST, - YdbStatusCode.CONNECTION_FAILURE, - YdbStatusCode.DEADLINE_EXCEEDED, - YdbStatusCode.CLIENT_INTERNAL_ERROR, - YdbStatusCode.UNIMPLEMENTED, - } -) - -_tracer = None -_meter = None -_enabled = False - -_KIND_MAP = { - "client": trace.SpanKind.CLIENT, - "internal": trace.SpanKind.INTERNAL, -} - - -def _otel_metadata_hook(): - """Inject W3C Trace Context into outgoing gRPC metadata using the active OTel context.""" - headers = {} - inject(headers) - return list(headers.items()) - - -def _set_error_on_span(span, exception): - if isinstance(exception, issues.Error) and exception.status is not None: - span.set_attribute("db.response.status_code", exception.status.name) - error_type = "transport_error" if exception.status in _TRANSPORT_STATUSES else "ydb_error" - else: - error_type = type(exception).__qualname__ - - span.set_attribute("error.type", error_type) - span.set_status(StatusCode.ERROR, str(exception)) - span.record_exception(exception) - - -class _AttachContext: - """Make a span the active OTel context for a ``with`` block, without ending it. - - Used around the initial gRPC call of a streaming RPC: the span outlives the - ``with`` block — the result iterator owns ``end()``. For non-streaming RPCs - use ``with create_ydb_span(...)`` directly. - """ - - def __init__(self, raw_span): - self._raw = raw_span - self._token = None - - def __enter__(self): - ctx = trace.set_span_in_context(self._raw) - self._token = otel_context.attach(ctx) - return self - - def __exit__(self, exc_type, exc_val, exc_tb): - if self._token is not None: - otel_context.detach(self._token) - self._token = None - return False - - -class TracingSpan: - """Wrapper around an OTel span. - - As context manager: ``__enter__`` attaches the OTel context (so child spans - nest correctly and ``inject()`` sees this span when building gRPC metadata) - and ``__exit__`` detaches and ends the span. 
Used by Commit / Rollback / - RunWithRetry / Try and similar single-shot operations. - - For ExecuteQuery streams the span outlives the ``with`` block: call - :meth:`attach_context` around the initial gRPC call only, and let the result - iterator own ``end()``. - """ - - def __init__(self, span, name, attributes): - self._span = span - self._otel_context_token = None - self._metrics_operation = create_metrics_operation(name, attributes) - - def set_error(self, exception): - _set_error_on_span(self._span, exception) - self._metrics_operation.set_error(exception) - - def set_attribute(self, key, value): - self._span.set_attribute(key, value) - self._metrics_operation.set_attribute(key, value) - - def end(self): - self._span.end() - self._metrics_operation.end() - - def attach_context(self): - return _AttachContext(self._span) - - def __enter__(self): - ctx = trace.set_span_in_context(self._span) - self._otel_context_token = otel_context.attach(ctx) - return self - - def __exit__(self, exc_type, exc_val, exc_tb): - if self._otel_context_token is not None: - otel_context.detach(self._otel_context_token) - self._otel_context_token = None - if exc_val is not None: - self.set_error(exc_val) - self.end() - return False - - -def _create_span(name, attributes=None, kind=None): - attrs = attributes or {} - span = _tracer.start_span( - name, - kind=_KIND_MAP.get(kind, trace.SpanKind.CLIENT), - attributes=attrs, - ) - return TracingSpan(span, name, attrs) - - -def _enable_tracing(tracer=None): - global _enabled, _tracer - - if _enabled: - return - - _tracer = tracer if tracer is not None else trace.get_tracer("ydb.sdk") - _enabled = True - _tracing_registry.set_metadata_hook(_otel_metadata_hook) - _tracing_registry.set_create_span(_create_span) - - -def _disable_tracing(): - """Clear hooks and tracer; after this, :func:`~ydb.opentelemetry.enable_tracing` may be called again.""" - global _enabled, _tracer - - _tracing_registry.set_create_span(None) - 
_tracing_registry.set_metadata_hook(None) - _enabled = False - _tracer = None - - -def _create_query_session_count_callback(): - """Create callback for observable query session count metric.""" - - def observe_query_session_count(_): - values = _metrics_registry.get_query_session_count_values() - return [Observation(value, attributes=dict(attrs)) for attrs, value in values.items()] - - return observe_query_session_count - - -def _enable_metrics(meter_provider): - global _meter - - if _meter is not None: - return - - if meter_provider is None: - _meter = metrics.get_meter("ydb.sdk") - elif hasattr(meter_provider, "get_meter"): - _meter = meter_provider.get_meter("ydb.sdk") - else: - raise TypeError("meter_provider must be an OpenTelemetry MeterProvider") - - _metrics_registry.set_meter(_meter, _create_query_session_count_callback()) - - -def _disable_metrics(): - global _meter - - _metrics_registry.clear() - if _meter is not None: - _meter = None diff --git a/ydb/opentelemetry/plugin.py b/ydb/opentelemetry/plugin.py index 76942789f..5dd9c996d 100644 --- a/ydb/opentelemetry/plugin.py +++ b/ydb/opentelemetry/plugin.py @@ -2,12 +2,17 @@ from opentelemetry import context as otel_context from opentelemetry import trace +from opentelemetry.metrics import Observation +from opentelemetry import metrics from opentelemetry.propagate import inject from opentelemetry.trace import StatusCode from ydb import issues from ydb.issues import StatusCode as YdbStatusCode -from ydb.opentelemetry.tracing import _registry +from ydb.opentelemetry import metrics + +from ydb.opentelemetry.metrics import _metrics_registry, create_metrics_operation +from ydb.opentelemetry.tracing import _registry as _tracing_registry # YDB client transport StatusCode values (401xxx band) -> OTel error.type transport_error. 
_TRANSPORT_STATUSES = frozenset( @@ -21,7 +26,8 @@ ) _tracer = None -_enabled = False +_tracing_enabled = False +_meter = None _KIND_MAP = { "client": trace.SpanKind.CLIENT, @@ -113,22 +119,55 @@ def _create_span(name, attributes=None, kind=None): def _enable_tracing(tracer=None): - global _enabled, _tracer + global _tracing_enabled, _tracer - if _enabled: + if _tracing_enabled: return _tracer = tracer if tracer is not None else trace.get_tracer("ydb.sdk") - _enabled = True - _registry.set_metadata_hook(_otel_metadata_hook) - _registry.set_create_span(_create_span) + _tracing_enabled = True + _tracing_registry.set_metadata_hook(_otel_metadata_hook) + _tracing_registry.set_create_span(_create_span) def _disable_tracing(): """Clear hooks and tracer; after this, :func:`~ydb.opentelemetry.enable_tracing` may be called again.""" - global _enabled, _tracer + global _tracing_enabled, _tracer - _registry.set_create_span(None) - _registry.set_metadata_hook(None) - _enabled = False + _tracing_registry.set_create_span(None) + _tracing_registry.set_metadata_hook(None) + _tracing_enabled = False _tracer = None + +def _create_query_session_count_callback(): + """Create callback for observable query session count metric.""" + + def observe_query_session_count(_): + values = _metrics_registry.get_query_session_count_values() + return [Observation(value, attributes=dict(attrs)) for attrs, value in values.items()] + + return observe_query_session_count + + +def _enable_metrics(meter_provider): + global _meter + + if _meter is not None: + return + + if meter_provider is None: + _meter = metrics.get_meter("ydb.sdk") + elif hasattr(meter_provider, "get_meter"): + _meter = meter_provider.get_meter("ydb.sdk") + else: + raise TypeError("meter_provider must be an OpenTelemetry MeterProvider") + + _metrics_registry.set_meter(_meter, _create_query_session_count_callback()) + + +def _disable_metrics(): + global _meter + + _metrics_registry.clear() + if _meter is not None: + _meter = None From 
f8f1725dbed82ba46e49110caaa156ab51c204ee Mon Sep 17 00:00:00 2001 From: tewbo Date: Wed, 13 May 2026 16:17:16 +0300 Subject: [PATCH 03/30] + add retry metrics --- ydb/opentelemetry/metrics.py | 23 +++++++++++++++ ydb/retries.py | 57 ++++++++++++++++++++++++------------ 2 files changed, 61 insertions(+), 19 deletions(-) diff --git a/ydb/opentelemetry/metrics.py b/ydb/opentelemetry/metrics.py index e4131a558..86079969f 100644 --- a/ydb/opentelemetry/metrics.py +++ b/ydb/opentelemetry/metrics.py @@ -9,6 +9,8 @@ QUERY_SESSION_CREATE_TIME = "ydb.query.session.create_time" QUERY_SESSION_PENDING_REQUESTS = "ydb.query.session.pending_requests" QUERY_SESSION_TIMEOUTS = "ydb.query.session.timeouts" +RETRY_ATTEMPTS = "ydb.client.retry.attempts" +RETRY_DURATION = "ydb.client.retry.duration" _UNKNOWN_POOL = "unknown" @@ -55,6 +57,22 @@ def set_meter(self, meter: Any, observe_query_session_count_callback: Any) -> No unit="{connection}", description="Number of YDB query session acquisition timeouts.", ), + RETRY_DURATION: meter.create_histogram( + RETRY_DURATION, + unit="s", + description=( + "Total user-visible duration of a logical operation executed through the retry policy, " + "including all attempts and back-off delays." + ), + ), + RETRY_ATTEMPTS: meter.create_histogram( + RETRY_ATTEMPTS, + unit="{attempt}", + description=( + "Total number of attempts performed by the retry policy for one logical operation. " + "A value of 1 means the operation succeeded on the first try." 
+ ), + ), } def clear(self) -> None: @@ -218,3 +236,8 @@ def record_query_session_pending_requests(delta: int, pool_name: Optional[str]) def record_query_session_timeout(pool_name: Optional[str]) -> None: _metrics_registry.add(QUERY_SESSION_TIMEOUTS, 1, _pool_attrs(pool_name)) + + +def record_retry_metrics(duration: float, attempts: int) -> None: + _metrics_registry.record(RETRY_DURATION, duration) + _metrics_registry.record(RETRY_ATTEMPTS, attempts) diff --git a/ydb/retries.py b/ydb/retries.py index 4b7c137f3..5765d50e7 100644 --- a/ydb/retries.py +++ b/ydb/retries.py @@ -7,6 +7,7 @@ from . import issues from ._errors import check_retriable_error +from .opentelemetry.metrics import record_retry_metrics from .opentelemetry.tracing import SpanName, create_span as _create_span @@ -164,21 +165,28 @@ def retry_operation_sync( **kwargs: Any, ) -> Any: backoff_ms: Optional[int] = None + attempts = 0 + start_time = time.monotonic() @functools.wraps(callee) def traced_callee(*a: Any, **kw: Any) -> Any: + nonlocal attempts + attempts += 1 with _create_span(SpanName.TRY, _try_span_attrs(backoff_ms)): return callee(*a, **kw) - with _create_span(SpanName.RUN_WITH_RETRY): - for next_opt in retry_operation_impl(traced_callee, retry_settings, *args, **kwargs): - if isinstance(next_opt, YdbRetryOperationSleepOpt): - backoff_ms = int(next_opt.timeout * 1000) - if next_opt.timeout > 0: - time.sleep(next_opt.timeout) - else: - return next_opt.result - return None + try: + with _create_span(SpanName.RUN_WITH_RETRY): + for next_opt in retry_operation_impl(traced_callee, retry_settings, *args, **kwargs): + if isinstance(next_opt, YdbRetryOperationSleepOpt): + backoff_ms = int(next_opt.timeout * 1000) + if next_opt.timeout > 0: + time.sleep(next_opt.timeout) + else: + return next_opt.result + return None + finally: + record_retry_metrics(time.monotonic() - start_time, attempts) async def retry_operation_async( # pylint: disable=W1113 @@ -200,20 +208,31 @@ async def retry_operation_async( 
# pylint: disable=W1113 Returns awaitable result of coroutine. If retries are not succussful exception is raised. """ backoff_ms: Optional[int] = None - with _create_span(SpanName.RUN_WITH_RETRY): - for next_opt in retry_operation_impl(callee, retry_settings, *args, **kwargs): - if isinstance(next_opt, YdbRetryOperationSleepOpt): - backoff_ms = int(next_opt.timeout * 1000) - if next_opt.timeout > 0: - await asyncio.sleep(next_opt.timeout) - else: - with _create_span(SpanName.TRY, _try_span_attrs(backoff_ms)) as try_span: + attempts = 0 + start_time = time.monotonic() + + @functools.wraps(callee) + async def traced_callee(*a: Any, **kw: Any) -> Any: + nonlocal attempts + attempts += 1 + with _create_span(SpanName.TRY, _try_span_attrs(backoff_ms)): + return await callee(*a, **kw) + + try: + with _create_span(SpanName.RUN_WITH_RETRY): + for next_opt in retry_operation_impl(traced_callee, retry_settings, *args, **kwargs): + if isinstance(next_opt, YdbRetryOperationSleepOpt): + backoff_ms = int(next_opt.timeout * 1000) + if next_opt.timeout > 0: + await asyncio.sleep(next_opt.timeout) + else: try: return await next_opt.result except BaseException as e: # pylint: disable=W0703 - try_span.set_error(e) next_opt.set_exception(e) - return None + return None + finally: + record_retry_metrics(time.monotonic() - start_time, attempts) def ydb_retry( From 15b43bf4ac02e1cf58bbf08bacfb33d944312b1d Mon Sep 17 00:00:00 2001 From: tewbo Date: Thu, 14 May 2026 11:14:32 +0300 Subject: [PATCH 04/30] refactoring --- tests/{tracing => opentelemetry}/__init__.py | 0 tests/{tracing => opentelemetry}/conftest.py | 0 tests/{tracing => opentelemetry}/test_tracing_async.py | 0 tests/{tracing => opentelemetry}/test_tracing_sync.py | 2 +- 4 files changed, 1 insertion(+), 1 deletion(-) rename tests/{tracing => opentelemetry}/__init__.py (100%) rename tests/{tracing => opentelemetry}/conftest.py (100%) rename tests/{tracing => opentelemetry}/test_tracing_async.py (100%) rename tests/{tracing => 
opentelemetry}/test_tracing_sync.py (99%) diff --git a/tests/tracing/__init__.py b/tests/opentelemetry/__init__.py similarity index 100% rename from tests/tracing/__init__.py rename to tests/opentelemetry/__init__.py diff --git a/tests/tracing/conftest.py b/tests/opentelemetry/conftest.py similarity index 100% rename from tests/tracing/conftest.py rename to tests/opentelemetry/conftest.py diff --git a/tests/tracing/test_tracing_async.py b/tests/opentelemetry/test_tracing_async.py similarity index 100% rename from tests/tracing/test_tracing_async.py rename to tests/opentelemetry/test_tracing_async.py diff --git a/tests/tracing/test_tracing_sync.py b/tests/opentelemetry/test_tracing_sync.py similarity index 99% rename from tests/tracing/test_tracing_sync.py rename to tests/opentelemetry/test_tracing_sync.py index 9f8bbc421..38e5b8ce0 100644 --- a/tests/tracing/test_tracing_sync.py +++ b/tests/opentelemetry/test_tracing_sync.py @@ -314,7 +314,7 @@ class TestNoSpansWhenDisabled: def test_no_spans_without_enable_tracing(self): """Without enable_tracing(), the registry uses noop — no spans are created.""" - from tests.tracing.conftest import _exporter + from tests.opentelemetry.conftest import _exporter _registry.set_create_span(None) _registry.set_metadata_hook(None) From 9ab11b30077febe58a0660204b9ec95cdf8c3187 Mon Sep 17 00:00:00 2001 From: tewbo Date: Thu, 14 May 2026 11:14:50 +0300 Subject: [PATCH 05/30] add metrics test --- tests/opentelemetry/test_metrics.py | 287 ++++++++++++++++++++++++++++ ydb/aio/query/pool_test.py | 1 + 2 files changed, 288 insertions(+) create mode 100644 tests/opentelemetry/test_metrics.py diff --git a/tests/opentelemetry/test_metrics.py b/tests/opentelemetry/test_metrics.py new file mode 100644 index 000000000..0f52fe324 --- /dev/null +++ b/tests/opentelemetry/test_metrics.py @@ -0,0 +1,287 @@ +from unittest.mock import MagicMock + +import pytest +from opentelemetry.sdk.metrics import MeterProvider +from opentelemetry.sdk.metrics.export 
import InMemoryMetricReader + + +@pytest.fixture() +def metrics_reader(): + from ydb.opentelemetry.metrics import _metrics_registry + from ydb.opentelemetry.plugin import _create_query_session_count_callback + + reader = InMemoryMetricReader() + provider = MeterProvider(metric_readers=[reader]) + meter = provider.get_meter("ydb.sdk") + + _metrics_registry.set_meter(meter, _create_query_session_count_callback()) + try: + yield reader + finally: + _metrics_registry.clear() + provider.shutdown() + + +def _metrics_by_name(reader): + data = reader.get_metrics_data() + if data is None: + return {} + + return { + metric.name: metric + for resource_metrics in data.resource_metrics + for scope_metrics in resource_metrics.scope_metrics + for metric in scope_metrics.metrics + } + + +def _single_point(reader, name): + metric = _metrics_by_name(reader)[name] + points = list(metric.data.data_points) + assert len(points) == 1 + return points[0] + + +def _histogram_sum(reader, name): + return _single_point(reader, name).sum + + +def _sum_value(reader, name): + return _single_point(reader, name).value + + +def test_metrics_registry_records_all_instruments(metrics_reader, monkeypatch): + from ydb import issues + from ydb.opentelemetry.metrics import ( + CLIENT_OPERATION_DURATION, + CLIENT_OPERATION_FAILED, + QUERY_SESSION_COUNT, + QUERY_SESSION_CREATE_TIME, + QUERY_SESSION_PENDING_REQUESTS, + QUERY_SESSION_TIMEOUTS, + RETRY_ATTEMPTS, + RETRY_DURATION, + create_metrics_operation, + record_query_session_count, + record_query_session_create_time, + record_query_session_pending_requests, + record_query_session_timeout, + record_retry_metrics, + ) + + monkeypatch.setattr("ydb.opentelemetry.metrics.time.monotonic", MagicMock(side_effect=[1.0, 1.25])) + + with pytest.raises(issues.Unavailable): + with create_metrics_operation("ExecuteQuery"): + raise issues.Unavailable("transient") + + record_query_session_count(2, "main", "used") + record_query_session_create_time(0.5, "main") + 
record_query_session_pending_requests(1, "main") + record_query_session_timeout("main") + record_retry_metrics(0.75, 3) + + metrics = _metrics_by_name(metrics_reader) + + assert set(metrics) == { + CLIENT_OPERATION_DURATION, + CLIENT_OPERATION_FAILED, + QUERY_SESSION_COUNT, + QUERY_SESSION_CREATE_TIME, + QUERY_SESSION_PENDING_REQUESTS, + QUERY_SESSION_TIMEOUTS, + RETRY_ATTEMPTS, + RETRY_DURATION, + } + assert metrics[CLIENT_OPERATION_DURATION].unit == "s" + assert metrics[CLIENT_OPERATION_FAILED].unit == "{command}" + assert metrics[QUERY_SESSION_COUNT].unit == "{connection}" + assert metrics[QUERY_SESSION_CREATE_TIME].unit == "s" + assert metrics[QUERY_SESSION_PENDING_REQUESTS].unit == "{request}" + assert metrics[QUERY_SESSION_TIMEOUTS].unit == "{connection}" + assert metrics[RETRY_DURATION].unit == "s" + assert metrics[RETRY_ATTEMPTS].unit == "{attempt}" + + +def test_metrics_registry_is_noop_without_meter(): + from ydb.opentelemetry.metrics import ( + _metrics_registry, + create_metrics_operation, + record_query_session_create_time, + record_query_session_pending_requests, + record_query_session_timeout, + record_retry_metrics, + ) + + _metrics_registry.clear() + + record_query_session_create_time(1.0, "pool") + record_query_session_pending_requests(1, "pool") + record_query_session_timeout("pool") + record_retry_metrics(1.0, 2) + + with create_metrics_operation("test.operation"): + pass + + +def test_metrics_operation_records_duration_once(metrics_reader, monkeypatch): + from ydb.opentelemetry.metrics import CLIENT_OPERATION_DURATION, create_metrics_operation + + monotonic = MagicMock(side_effect=[10.0, 10.25, 11.0]) + monkeypatch.setattr("ydb.opentelemetry.metrics.time.monotonic", monotonic) + + operation = create_metrics_operation( + "ExecuteQuery", + { + "db.namespace": "/Root/test", + "server.address": "localhost", + "server.port": 2136, + }, + ) + operation.end() + operation.end() + + point = _single_point(metrics_reader, CLIENT_OPERATION_DURATION) + + 
assert point.sum == 0.25 + assert point.count == 1 + assert point.attributes == { + "db.system.name": "ydb", + "db.namespace": "/Root/test", + "server.address": "localhost", + "server.port": 2136, + "ydb.operation.name": "ExecuteQuery", + } + + +def test_metrics_operation_records_ydb_error(metrics_reader, monkeypatch): + from ydb import issues + from ydb.opentelemetry.metrics import CLIENT_OPERATION_FAILED, create_metrics_operation + + monkeypatch.setattr("ydb.opentelemetry.metrics.time.monotonic", MagicMock(side_effect=[1.0, 1.1])) + + with pytest.raises(issues.Unavailable): + with create_metrics_operation("ExecuteQuery"): + raise issues.Unavailable("transient") + + point = _single_point(metrics_reader, CLIENT_OPERATION_FAILED) + + assert point.value == 1 + assert point.attributes["db.response.status_code"] == "UNAVAILABLE" + assert point.attributes["ydb.operation.name"] == "ExecuteQuery" + + +def test_metrics_operation_records_generic_error_status_code(metrics_reader): + from ydb.opentelemetry.metrics import CLIENT_OPERATION_FAILED, create_metrics_operation + + with pytest.raises(ValueError): + with create_metrics_operation("ExecuteQuery"): + raise ValueError("bad value") + + assert _single_point(metrics_reader, CLIENT_OPERATION_FAILED).attributes["db.response.status_code"] == "ValueError" + + +def test_metrics_operation_set_attribute(metrics_reader): + from ydb.opentelemetry.metrics import CLIENT_OPERATION_DURATION, create_metrics_operation + + operation = create_metrics_operation("ExecuteQuery") + operation.set_attribute("ydb.node.id", 123) + operation.end() + + assert _single_point(metrics_reader, CLIENT_OPERATION_DURATION).attributes["ydb.node.id"] == 123 + + +def test_query_session_count_accumulates_by_attributes(metrics_reader): + from ydb.opentelemetry.metrics import QUERY_SESSION_COUNT, record_query_session_count + + record_query_session_count(1, "main", "used") + record_query_session_count(2, "main", "used") + record_query_session_count(1, None, "idle") 
+ + metric = _metrics_by_name(metrics_reader)[QUERY_SESSION_COUNT] + values = {tuple(sorted(point.attributes.items())): point.value for point in metric.data.data_points} + + assert ( + values[ + ( + ("ydb.query.session.pool.name", "main"), + ("ydb.query.session.state", "used"), + ) + ] + == 3 + ) + assert ( + values[ + ( + ("ydb.query.session.pool.name", "unknown"), + ("ydb.query.session.state", "idle"), + ) + ] + == 1 + ) + + +def test_query_session_helpers_record_pool_attributes(metrics_reader): + from ydb.opentelemetry.metrics import ( + QUERY_SESSION_CREATE_TIME, + QUERY_SESSION_PENDING_REQUESTS, + QUERY_SESSION_TIMEOUTS, + record_query_session_create_time, + record_query_session_pending_requests, + record_query_session_timeout, + ) + + record_query_session_create_time(0.5, "main") + record_query_session_pending_requests(1, None) + record_query_session_timeout("main") + + assert _histogram_sum(metrics_reader, QUERY_SESSION_CREATE_TIME) == 0.5 + assert _single_point(metrics_reader, QUERY_SESSION_CREATE_TIME).attributes == { + "ydb.query.session.pool.name": "main" + } + assert _sum_value(metrics_reader, QUERY_SESSION_PENDING_REQUESTS) == 1 + assert _single_point(metrics_reader, QUERY_SESSION_PENDING_REQUESTS).attributes == { + "ydb.query.session.pool.name": "unknown" + } + assert _sum_value(metrics_reader, QUERY_SESSION_TIMEOUTS) == 1 + assert _single_point(metrics_reader, QUERY_SESSION_TIMEOUTS).attributes == {"ydb.query.session.pool.name": "main"} + + +def test_retry_operation_sync_records_retry_metrics(metrics_reader): + from ydb import issues + from ydb.opentelemetry.metrics import RETRY_ATTEMPTS, RETRY_DURATION + from ydb.retries import RetrySettings, retry_operation_sync + + attempts = {"count": 0} + + def flaky(): + attempts["count"] += 1 + if attempts["count"] < 3: + raise issues.Aborted("retry") + return "ok" + + assert retry_operation_sync(flaky, RetrySettings(max_retries=5)) == "ok" + + duration = _single_point(metrics_reader, RETRY_DURATION) + assert 
duration.count == 1 + assert duration.sum >= 0 + assert duration.attributes == {} + assert _histogram_sum(metrics_reader, RETRY_ATTEMPTS) == 3 + + +async def _async_value(): + return "ok" + + +@pytest.mark.asyncio +async def test_retry_operation_async_records_retry_metrics(metrics_reader): + from ydb.opentelemetry.metrics import RETRY_ATTEMPTS, RETRY_DURATION + from ydb.retries import retry_operation_async + + assert await retry_operation_async(_async_value) == "ok" + + duration = _single_point(metrics_reader, RETRY_DURATION) + assert duration.count == 1 + assert duration.sum >= 0 + assert duration.attributes == {} + assert _histogram_sum(metrics_reader, RETRY_ATTEMPTS) == 1 diff --git a/ydb/aio/query/pool_test.py b/ydb/aio/query/pool_test.py index de33a8e02..ad62f30e1 100644 --- a/ydb/aio/query/pool_test.py +++ b/ydb/aio/query/pool_test.py @@ -19,6 +19,7 @@ def _make_pool(size=1): pool._current_size = 0 pool._loop = asyncio.get_event_loop() pool._query_client_settings = None + pool._metrics_pool_name = "test-query-session-pool" return pool From 709ca05742cdabca1f653dbc8a8aa7bea26eaaf1 Mon Sep 17 00:00:00 2001 From: tewbo Date: Fri, 15 May 2026 01:48:29 +0300 Subject: [PATCH 06/30] add metric and refactor --- tests/opentelemetry/test_metrics.py | 158 +++++++++++++++++++++++++++- ydb/aio/query/pool.py | 2 + ydb/opentelemetry/metrics.py | 77 +++++++++++--- ydb/opentelemetry/plugin.py | 21 +++- ydb/opentelemetry/tracing.py | 71 +++++++++++-- ydb/query/pool.py | 2 + 6 files changed, 304 insertions(+), 27 deletions(-) diff --git a/tests/opentelemetry/test_metrics.py b/tests/opentelemetry/test_metrics.py index 0f52fe324..e0a57e948 100644 --- a/tests/opentelemetry/test_metrics.py +++ b/tests/opentelemetry/test_metrics.py @@ -14,7 +14,13 @@ def metrics_reader(): provider = MeterProvider(metric_readers=[reader]) meter = provider.get_meter("ydb.sdk") - _metrics_registry.set_meter(meter, _create_query_session_count_callback()) + from ydb.opentelemetry.plugin import 
_create_query_session_max_callback + + _metrics_registry.set_meter( + meter, + _create_query_session_count_callback(), + _create_query_session_max_callback(), + ) try: yield reader finally: @@ -57,6 +63,7 @@ def test_metrics_registry_records_all_instruments(metrics_reader, monkeypatch): CLIENT_OPERATION_FAILED, QUERY_SESSION_COUNT, QUERY_SESSION_CREATE_TIME, + QUERY_SESSION_MAX, QUERY_SESSION_PENDING_REQUESTS, QUERY_SESSION_TIMEOUTS, RETRY_ATTEMPTS, @@ -64,6 +71,7 @@ def test_metrics_registry_records_all_instruments(metrics_reader, monkeypatch): create_metrics_operation, record_query_session_count, record_query_session_create_time, + record_query_session_max, record_query_session_pending_requests, record_query_session_timeout, record_retry_metrics, @@ -77,6 +85,7 @@ def test_metrics_registry_records_all_instruments(metrics_reader, monkeypatch): record_query_session_count(2, "main", "used") record_query_session_create_time(0.5, "main") + record_query_session_max(100, "main") record_query_session_pending_requests(1, "main") record_query_session_timeout("main") record_retry_metrics(0.75, 3) @@ -88,6 +97,7 @@ def test_metrics_registry_records_all_instruments(metrics_reader, monkeypatch): CLIENT_OPERATION_FAILED, QUERY_SESSION_COUNT, QUERY_SESSION_CREATE_TIME, + QUERY_SESSION_MAX, QUERY_SESSION_PENDING_REQUESTS, QUERY_SESSION_TIMEOUTS, RETRY_ATTEMPTS, @@ -97,6 +107,7 @@ def test_metrics_registry_records_all_instruments(metrics_reader, monkeypatch): assert metrics[CLIENT_OPERATION_FAILED].unit == "{command}" assert metrics[QUERY_SESSION_COUNT].unit == "{connection}" assert metrics[QUERY_SESSION_CREATE_TIME].unit == "s" + assert metrics[QUERY_SESSION_MAX].unit == "{connection}" assert metrics[QUERY_SESSION_PENDING_REQUESTS].unit == "{request}" assert metrics[QUERY_SESSION_TIMEOUTS].unit == "{connection}" assert metrics[RETRY_DURATION].unit == "s" @@ -185,10 +196,127 @@ def test_metrics_operation_set_attribute(metrics_reader): from ydb.opentelemetry.metrics import 
CLIENT_OPERATION_DURATION, create_metrics_operation operation = create_metrics_operation("ExecuteQuery") + operation.set_attribute("db.namespace", "/Root/test") + operation.end() + + assert _single_point(metrics_reader, CLIENT_OPERATION_DURATION).attributes["db.namespace"] == "/Root/test" + + +def test_metrics_operation_ignores_non_metric_attributes(metrics_reader): + from ydb.opentelemetry.metrics import CLIENT_OPERATION_DURATION, create_metrics_operation + + operation = create_metrics_operation("ExecuteQuery") + operation.set_attribute("network.peer.address", "node.example.net") + operation.set_attribute("network.peer.port", 2136) + operation.set_attribute("ydb.node.dc", "dc-a") operation.set_attribute("ydb.node.id", 123) operation.end() - assert _single_point(metrics_reader, CLIENT_OPERATION_DURATION).attributes["ydb.node.id"] == 123 + attrs = _single_point(metrics_reader, CLIENT_OPERATION_DURATION).attributes + + assert "network.peer.address" not in attrs + assert "network.peer.port" not in attrs + assert "ydb.node.dc" not in attrs + assert "ydb.node.id" not in attrs + + +def test_metrics_operation_respects_end_on_exit_false(metrics_reader): + from ydb.opentelemetry.metrics import CLIENT_OPERATION_DURATION, create_metrics_operation + + operation = create_metrics_operation("ExecuteQuery") + with operation.attach_context(end_on_exit=False): + pass + + assert CLIENT_OPERATION_DURATION not in _metrics_by_name(metrics_reader) + + operation.end() + + point = _single_point(metrics_reader, CLIENT_OPERATION_DURATION) + assert point.count == 1 + assert point.sum >= 0 + + +def test_create_ydb_span_records_metrics_when_tracing_is_active(metrics_reader): + from tests.opentelemetry.conftest import FakeDriverConfig + from ydb.opentelemetry.metrics import CLIENT_OPERATION_DURATION + from ydb.opentelemetry.tracing import _registry, create_ydb_span + + class FakeSpan: + def __init__(self): + self.ended = False + self.attributes = {} + self._end_on_exit = True + + def 
set_error(self, exception): + pass + + def set_attribute(self, key, value): + self.attributes[key] = value + + def end(self): + self.ended = True + + def attach_context(self, end_on_exit=True): + self._end_on_exit = end_on_exit + return self + + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + if exc_val is not None or self._end_on_exit: + self.end() + return False + + created_spans = [] + + def create_span(name, attributes=None, kind=None): + span = FakeSpan() + span.attributes.update(attributes or {}) + created_spans.append(span) + return span + + try: + _registry.set_create_span(create_span) + + with create_ydb_span( + "ydb.ExecuteQuery", + FakeDriverConfig(), + node_id=123, + peer=("node.example.net", 2136, "dc-a"), + ).attach_context(): + pass + finally: + _registry.set_create_span(None) + + assert len(created_spans) == 1 + assert created_spans[0].ended is True + assert created_spans[0].attributes["network.peer.address"] == "node.example.net" + assert created_spans[0].attributes["network.peer.port"] == 2136 + assert created_spans[0].attributes["ydb.node.dc"] == "dc-a" + assert created_spans[0].attributes["ydb.node.id"] == 123 + + metric_attrs = _single_point(metrics_reader, CLIENT_OPERATION_DURATION).attributes + assert metric_attrs["ydb.operation.name"] == "ydb.ExecuteQuery" + assert "network.peer.address" not in metric_attrs + assert "network.peer.port" not in metric_attrs + assert "ydb.node.dc" not in metric_attrs + assert "ydb.node.id" not in metric_attrs + + +def test_create_ydb_span_records_metrics_when_tracing_is_disabled(metrics_reader): + from tests.opentelemetry.conftest import FakeDriverConfig + from ydb.opentelemetry.metrics import CLIENT_OPERATION_DURATION + from ydb.opentelemetry.tracing import _registry, create_ydb_span + + _registry.set_create_span(None) + + with create_ydb_span("ydb.ExecuteQuery", FakeDriverConfig()).attach_context(): + pass + + assert ( + _single_point(metrics_reader, 
CLIENT_OPERATION_DURATION).attributes["ydb.operation.name"] == "ydb.ExecuteQuery" + ) def test_query_session_count_accumulates_by_attributes(metrics_reader): @@ -224,14 +352,17 @@ def test_query_session_count_accumulates_by_attributes(metrics_reader): def test_query_session_helpers_record_pool_attributes(metrics_reader): from ydb.opentelemetry.metrics import ( QUERY_SESSION_CREATE_TIME, + QUERY_SESSION_MAX, QUERY_SESSION_PENDING_REQUESTS, QUERY_SESSION_TIMEOUTS, record_query_session_create_time, + record_query_session_max, record_query_session_pending_requests, record_query_session_timeout, ) record_query_session_create_time(0.5, "main") + record_query_session_max(100, "main") record_query_session_pending_requests(1, None) record_query_session_timeout("main") @@ -245,6 +376,29 @@ def test_query_session_helpers_record_pool_attributes(metrics_reader): } assert _sum_value(metrics_reader, QUERY_SESSION_TIMEOUTS) == 1 assert _single_point(metrics_reader, QUERY_SESSION_TIMEOUTS).attributes == {"ydb.query.session.pool.name": "main"} + assert _single_point(metrics_reader, QUERY_SESSION_MAX).value == 100 + assert _single_point(metrics_reader, QUERY_SESSION_MAX).attributes == {"ydb.query.session.pool.name": "main"} + + +def test_sync_query_session_pool_records_max(metrics_reader): + from ydb.opentelemetry.metrics import QUERY_SESSION_MAX + from ydb.query.pool import QuerySessionPool + + QuerySessionPool(driver=object(), size=42, name="sync-pool") + + assert _single_point(metrics_reader, QUERY_SESSION_MAX).value == 42 + assert _single_point(metrics_reader, QUERY_SESSION_MAX).attributes == {"ydb.query.session.pool.name": "sync-pool"} + + +@pytest.mark.asyncio +async def test_async_query_session_pool_records_max(metrics_reader): + from ydb.aio.query.pool import QuerySessionPool + from ydb.opentelemetry.metrics import QUERY_SESSION_MAX + + QuerySessionPool(driver=object(), size=24, name="async-pool") + + assert _single_point(metrics_reader, QUERY_SESSION_MAX).value == 24 + 
assert _single_point(metrics_reader, QUERY_SESSION_MAX).attributes == {"ydb.query.session.pool.name": "async-pool"} def test_retry_operation_sync_records_retry_metrics(metrics_reader): diff --git a/ydb/aio/query/pool.py b/ydb/aio/query/pool.py index 21ce110fc..f0ffb9e63 100644 --- a/ydb/aio/query/pool.py +++ b/ydb/aio/query/pool.py @@ -27,6 +27,7 @@ from ...opentelemetry.metrics import ( record_query_session_count, record_query_session_create_time, + record_query_session_max, record_query_session_pending_requests, record_query_session_timeout, ) @@ -65,6 +66,7 @@ def __init__( self._loop = asyncio.get_running_loop() if loop is None else loop self._query_client_settings = query_client_settings self._metrics_pool_name = name or "query-session-pool-%d" % next(_pool_name_counter) + record_query_session_max(self._size, self._metrics_pool_name) async def _create_new_session(self): session = QuerySession(self._driver, settings=self._query_client_settings) diff --git a/ydb/opentelemetry/metrics.py b/ydb/opentelemetry/metrics.py index 86079969f..9d647942a 100644 --- a/ydb/opentelemetry/metrics.py +++ b/ydb/opentelemetry/metrics.py @@ -1,6 +1,7 @@ """No-op-safe helpers for YDB OpenTelemetry metrics.""" import time +import threading from typing import Any, Dict, Optional CLIENT_OPERATION_DURATION = "db.client.operation.duration" @@ -9,22 +10,35 @@ QUERY_SESSION_CREATE_TIME = "ydb.query.session.create_time" QUERY_SESSION_PENDING_REQUESTS = "ydb.query.session.pending_requests" QUERY_SESSION_TIMEOUTS = "ydb.query.session.timeouts" +QUERY_SESSION_MAX = "ydb.query.session.max" RETRY_ATTEMPTS = "ydb.client.retry.attempts" RETRY_DURATION = "ydb.client.retry.duration" _UNKNOWN_POOL = "unknown" - - -import threading +_OPERATION_ATTR_KEYS = frozenset( + { + "db.system.name", + "db.namespace", + "server.address", + "server.port", + "ydb.operation.name", + } +) class MetricsRegistry: def __init__(self) -> None: self._instruments: Dict[str, Any] = {} self._query_session_count_values: 
Dict[Any, int] = {} + self._query_session_max_values: Dict[Any, int] = {} self._query_session_count_lock = threading.Lock() - def set_meter(self, meter: Any, observe_query_session_count_callback: Any) -> None: + def set_meter( + self, + meter: Any, + observe_query_session_count_callback: Any, + observe_query_session_max_callback: Any, + ) -> None: self._instruments = { CLIENT_OPERATION_DURATION: meter.create_histogram( CLIENT_OPERATION_DURATION, @@ -57,6 +71,12 @@ def set_meter(self, meter: Any, observe_query_session_count_callback: Any) -> No unit="{connection}", description="Number of YDB query session acquisition timeouts.", ), + QUERY_SESSION_MAX: meter.create_observable_up_down_counter( + QUERY_SESSION_MAX, + callbacks=[observe_query_session_max_callback], + unit="{connection}", + description="Maximum configured number of YDB query sessions.", + ), RETRY_DURATION: meter.create_histogram( RETRY_DURATION, unit="s", @@ -79,6 +99,7 @@ def clear(self) -> None: self._instruments = {} with self._query_session_count_lock: self._query_session_count_values = {} + self._query_session_max_values = {} def add(self, name: str, value: int, attributes: Optional[Dict[str, Any]] = None) -> None: """ @@ -122,6 +143,16 @@ def get_query_session_count_values(self) -> Dict[Any, int]: with self._query_session_count_lock: return dict(self._query_session_count_values) + def set_query_session_max(self, value: int, attributes: Optional[Dict[str, Any]] = None) -> None: + attrs = tuple(sorted((attributes or {}).items())) + + with self._query_session_count_lock: + self._query_session_max_values[attrs] = value + + def get_query_session_max_values(self) -> Dict[Any, int]: + with self._query_session_count_lock: + return dict(self._query_session_max_values) + _metrics_registry = MetricsRegistry() @@ -175,6 +206,7 @@ def __init__(self, name: str, attributes: Optional[Dict[str, Any]] = None) -> No self._start_time = time.monotonic() self._exception: Optional[BaseException] = None self._ended = 
False + self._end_lock = threading.Lock() def set_error(self, exception: BaseException) -> None: """ @@ -186,17 +218,17 @@ def set_error(self, exception: BaseException) -> None: self._exception = exception def set_attribute(self, key: str, value: Any) -> None: - self._attributes[key] = value + if key in _OPERATION_ATTR_KEYS: + self._attributes[key] = value - def attach_context(self, end_on_exit=True) -> "MetricsOperation": - return self + def attach_context(self, end_on_exit=True) -> _MetricsOperationContext: + return _MetricsOperationContext(self, end_on_exit) def end(self) -> None: - # todo: consider multi-thread calling - - if self._ended: - return - self._ended = True + with self._end_lock: + if self._ended: + return + self._ended = True duration = time.monotonic() - self._start_time _metrics_registry.record(CLIENT_OPERATION_DURATION, duration, self._attributes) @@ -216,6 +248,23 @@ def __exit__(self, exc_type, exc_val, exc_tb) -> bool: return False +class _MetricsOperationContext: + def __init__(self, operation: MetricsOperation, end_on_exit: bool) -> None: + self._operation = operation + self._end_on_exit = end_on_exit + + def __enter__(self) -> MetricsOperation: + return self._operation + + def __exit__(self, exc_type, exc_val, exc_tb) -> bool: + if exc_val is not None: + self._operation.set_error(exc_val) + self._operation.end() + elif self._end_on_exit: + self._operation.end() + return False + + def create_metrics_operation(name: str, attributes: Optional[Dict[str, Any]] = None) -> MetricsOperation: return MetricsOperation(name, attributes) @@ -238,6 +287,10 @@ def record_query_session_timeout(pool_name: Optional[str]) -> None: _metrics_registry.add(QUERY_SESSION_TIMEOUTS, 1, _pool_attrs(pool_name)) +def record_query_session_max(value: int, pool_name: Optional[str]) -> None: + _metrics_registry.set_query_session_max(value, _pool_attrs(pool_name)) + + def record_retry_metrics(duration: float, attempts: int) -> None: _metrics_registry.record(RETRY_DURATION, 
duration) _metrics_registry.record(RETRY_ATTEMPTS, attempts) diff --git a/ydb/opentelemetry/plugin.py b/ydb/opentelemetry/plugin.py index 5dd9c996d..0a6983135 100644 --- a/ydb/opentelemetry/plugin.py +++ b/ydb/opentelemetry/plugin.py @@ -139,14 +139,25 @@ def _disable_tracing(): _tracing_enabled = False _tracer = None + +def _create_observable_callback(get_values): + """Create callback for observable metrics backed by the metrics registry.""" + + def observe(_): + values = get_values() + return [Observation(value, attributes=dict(attrs)) for attrs, value in values.items()] + + return observe + + def _create_query_session_count_callback(): """Create callback for observable query session count metric.""" + return _create_observable_callback(_metrics_registry.get_query_session_count_values) - def observe_query_session_count(_): - values = _metrics_registry.get_query_session_count_values() - return [Observation(value, attributes=dict(attrs)) for attrs, value in values.items()] - return observe_query_session_count +def _create_query_session_max_callback(): + """Create callback for observable query session max metric.""" + return _create_observable_callback(_metrics_registry.get_query_session_max_values) def _enable_metrics(meter_provider): @@ -162,7 +173,7 @@ def _enable_metrics(meter_provider): else: raise TypeError("meter_provider must be an OpenTelemetry MeterProvider") - _metrics_registry.set_meter(_meter, _create_query_session_count_callback()) + _metrics_registry.set_meter(_meter, _create_query_session_count_callback(), _create_query_session_max_callback()) def _disable_metrics(): diff --git a/ydb/opentelemetry/tracing.py b/ydb/opentelemetry/tracing.py index f78d7e2b1..d006b0523 100644 --- a/ydb/opentelemetry/tracing.py +++ b/ydb/opentelemetry/tracing.py @@ -18,6 +18,7 @@ class SpanName(str, enum.Enum): RUN_WITH_RETRY = "ydb.RunWithRetry" TRY = "ydb.Try" + class _NoopCtx: __slots__ = ("_span",) @@ -32,7 +33,7 @@ def __exit__(self, exc_type, exc_val, exc_tb): class 
_NoopSpan: - """Returned by create_ydb_span when tracing is disabled.""" + """Span-compatible object used when tracing is disabled.""" def set_error(self, exception): pass @@ -50,6 +51,52 @@ def attach_context(self, end_on_exit=True): _NOOP_SPAN = _NoopSpan() +class _TelemetryContext: + """Attach both tracing and metrics lifecycle contexts for one SDK operation.""" + + def __init__(self, telemetry, span_context, metrics_context): + self._telemetry = telemetry + self._span_context = span_context + self._metrics_context = metrics_context + + def __enter__(self): + self._metrics_context.__enter__() + self._span_context.__enter__() + return self._telemetry + + def __exit__(self, exc_type, exc_val, exc_tb): + span_result = self._span_context.__exit__(exc_type, exc_val, exc_tb) + metrics_result = self._metrics_context.__exit__(exc_type, exc_val, exc_tb) + return bool(span_result or metrics_result) + + +class _TelemetryOperation: + """Operation telemetry facade that fans lifecycle events out to tracing and metrics.""" + + def __init__(self, span, metrics): + self._span = span + self._metrics = metrics + + def set_error(self, exception): + self._span.set_error(exception) + self._metrics.set_error(exception) + + def set_attribute(self, key, value): + self._span.set_attribute(key, value) + self._metrics.set_attribute(key, value) + + def end(self): + self._span.end() + self._metrics.end() + + def attach_context(self, end_on_exit=True): + return _TelemetryContext( + self, + self._span.attach_context(end_on_exit=end_on_exit), + self._metrics.attach_context(end_on_exit=end_on_exit), + ) + + class OtelTracingRegistry: """Singleton registry for OpenTelemetry tracing. 
@@ -108,14 +155,18 @@ def _split_endpoint(endpoint: Optional[str]) -> Tuple[str, int]: return host, int(port_s) if port_s.isdigit() else 0 -def _build_ydb_attrs(driver_config, node_id=None, peer=None): +def _build_ydb_attrs(driver_config): host, port = _split_endpoint(getattr(driver_config, "endpoint", None)) - attrs = { + return { "db.system.name": "ydb", "db.namespace": getattr(driver_config, "database", None) or "", "server.address": host, "server.port": port, } + + +def _build_ydb_tracing_attrs(driver_config, node_id=None, peer=None): + attrs = _build_ydb_attrs(driver_config) if peer is not None: address, port_, location = peer if address is not None: @@ -135,11 +186,15 @@ def create_span(name, attributes=None, kind="internal"): def create_ydb_span(name, driver_config, node_id=None, kind=None, peer=None): - """Create a span pre-filled with standard YDB attributes.""" - attrs = _build_ydb_attrs(driver_config, node_id, peer) - if not _registry.is_active(): - return create_metrics_operation(name, attrs) - return _registry.create_span(name, attributes=attrs, kind=kind) + """Create telemetry for one user-visible YDB client operation. + + Tracing receives full operation context, including peer/node details. Metrics + receive only the stable labels defined for client operation metrics. 
+ """ + metrics_attrs = _build_ydb_attrs(driver_config) + tracing_attrs = _build_ydb_tracing_attrs(driver_config, node_id, peer) + metrics = create_metrics_operation(name, metrics_attrs) + return _TelemetryOperation(_registry.create_span(name, attributes=tracing_attrs, kind=kind), metrics) def set_peer_attributes(span, peer): diff --git a/ydb/query/pool.py b/ydb/query/pool.py index e17a451b7..8445769ef 100644 --- a/ydb/query/pool.py +++ b/ydb/query/pool.py @@ -31,6 +31,7 @@ from ..opentelemetry.metrics import ( record_query_session_count, record_query_session_create_time, + record_query_session_max, record_query_session_pending_requests, record_query_session_timeout, ) @@ -75,6 +76,7 @@ def __init__( self._lock = threading.RLock() self._query_client_settings = query_client_settings self._metrics_pool_name = name or "query-session-pool-%d" % next(_pool_name_counter) + record_query_session_max(self._size, self._metrics_pool_name) def _create_new_session(self, timeout: Optional[float]): session = QuerySession(self._driver, settings=self._query_client_settings) From a9caeb90e790336c940803076f0179bbdb117708 Mon Sep 17 00:00:00 2001 From: tewbo Date: Fri, 15 May 2026 22:41:46 +0300 Subject: [PATCH 07/30] rewrite example --- examples/opentelemetry/otel_example.py | 26 ++++-- .../opentelemetry/otel_metrics_example.py | 91 ------------------- 2 files changed, 20 insertions(+), 97 deletions(-) delete mode 100644 examples/opentelemetry/otel_metrics_example.py diff --git a/examples/opentelemetry/otel_example.py b/examples/opentelemetry/otel_example.py index 6ec0c5a84..db6f6089e 100644 --- a/examples/opentelemetry/otel_example.py +++ b/examples/opentelemetry/otel_example.py @@ -8,8 +8,13 @@ import asyncio import os + +from opentelemetry.exporter.otlp.proto.grpc.metric_exporter import OTLPMetricExporter +from opentelemetry.sdk.metrics import MeterProvider +from opentelemetry.sdk.metrics._internal.export import PeriodicExportingMetricReader + import ydb -from ydb.opentelemetry 
import enable_tracing +from ydb.opentelemetry import enable_tracing, enable_registry, disable_registry from opentelemetry import trace from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter from opentelemetry.sdk.resources import Resource @@ -45,13 +50,20 @@ async def main() -> None: otlp_endpoint = _env("OTEL_EXPORTER_OTLP_ENDPOINT", "http://localhost:4317") resource = Resource(attributes={"service.name": _env("OTEL_SERVICE_NAME", "ydb-otel-example")}) - provider = TracerProvider(resource=resource) - provider.add_span_processor(BatchSpanProcessor(OTLPSpanExporter(endpoint=otlp_endpoint))) - trace.set_tracer_provider(provider) + tracer_provider = TracerProvider(resource=resource) + tracer_provider.add_span_processor(BatchSpanProcessor(OTLPSpanExporter(endpoint=otlp_endpoint))) + trace.set_tracer_provider(tracer_provider) tracer = trace.get_tracer(__name__) enable_tracing(tracer) + metric_reader = PeriodicExportingMetricReader( + OTLPMetricExporter(endpoint=otlp_endpoint), + export_interval_millis=1000, + ) + meter_provider = MeterProvider(resource=resource, metric_readers=[metric_reader]) + enable_registry(meter_provider) + async with ydb.aio.Driver( endpoint=endpoint, database=database, @@ -84,8 +96,10 @@ async def concurrent_task(task_num: int) -> None: final_rows = await pool.execute_with_retries("SELECT amount FROM bank WHERE id = 1") amount = int(list(final_rows[0].rows)[0]["amount"]) print(f"Final amount (after serializable retries): {amount}") - - provider.shutdown() + print("Application will shut down in 15 seconds...") + await asyncio.sleep(15) + tracer_provider.shutdown() + meter_provider.shutdown() if __name__ == "__main__": diff --git a/examples/opentelemetry/otel_metrics_example.py b/examples/opentelemetry/otel_metrics_example.py deleted file mode 100644 index fb893b29e..000000000 --- a/examples/opentelemetry/otel_metrics_example.py +++ /dev/null @@ -1,91 +0,0 @@ -"""OpenTelemetry metrics demo for YDB client-side metrics. 
- -The example exports SDK metrics to the OpenTelemetry Collector via OTLP. The -collector exposes them for Prometheus, which is configured in compose-e2e.yaml. -""" - -from __future__ import annotations - -import asyncio -import os -import signal -from types import FrameType -from typing import Callable, Optional - -import ydb -from opentelemetry.exporter.otlp.proto.grpc.metric_exporter import OTLPMetricExporter -from opentelemetry.sdk.metrics import MeterProvider -from opentelemetry.sdk.metrics.export import PeriodicExportingMetricReader -from opentelemetry.sdk.resources import Resource -from ydb.opentelemetry import enable_registry - - -def _env(name: str, default: str) -> str: - value = os.environ.get(name) - return value if value else default - - -def _create_stop_event() -> asyncio.Event: - stop = asyncio.Event() - loop = asyncio.get_running_loop() - request_stop: Callable[[], None] = stop.set - handle_stop_signal: Callable[[int, Optional[FrameType]], None] = lambda signum, frame: stop.set() - - for sig in (signal.SIGINT, signal.SIGTERM): - try: - loop.add_signal_handler(sig, request_stop) - except NotImplementedError: - signal.signal(sig, handle_stop_signal) - - return stop - - -async def _run_workload(pool: ydb.aio.QuerySessionPool, stop: asyncio.Event) -> None: - counter = 0 - while not stop.is_set(): - counter += 1 - result_sets = await asyncio.gather( - *( - pool.execute_with_retries( - "SELECT $session_id AS session_id, $iteration AS iteration", - parameters={ - "$session_id": (i, ydb.PrimitiveType.Uint64), - "$iteration": (counter, ydb.PrimitiveType.Uint64), - }, - ) - for i in range(4) - ) - ) - session_ids = [int(list(result[0].rows)[0]["session_id"]) for result in result_sets] - print(f"completed concurrent queries: {session_ids}") - await asyncio.sleep(2) - - -async def main() -> None: - endpoint = _env("YDB_ENDPOINT", "grpc://localhost:2136") - database = _env("YDB_DATABASE", "/local") - otlp_endpoint = _env("OTEL_EXPORTER_OTLP_ENDPOINT", 
"http://localhost:4317") - - resource = Resource(attributes={"service.name": _env("OTEL_SERVICE_NAME", "ydb-client-metrics-example")}) - metric_reader = PeriodicExportingMetricReader( - OTLPMetricExporter(endpoint=otlp_endpoint), - export_interval_millis=1000, - ) - meter_provider = MeterProvider(resource=resource, metric_readers=[metric_reader]) - enable_registry(meter_provider) - - stop = _create_stop_event() - - try: - async with ydb.aio.Driver(endpoint=endpoint, database=database, disable_discovery=True) as driver: - await driver.wait(timeout=60) - - async with ydb.aio.QuerySessionPool(driver, size=4) as pool: - print("YDB client metrics are being exported. Open Prometheus and query ydb_query_session_count.") - await _run_workload(pool, stop) - finally: - meter_provider.shutdown() - - -if __name__ == "__main__": - asyncio.run(main()) From 751ea87ccf091f9ef2f099d61be157dc9394b9a2 Mon Sep 17 00:00:00 2001 From: tewbo Date: Fri, 15 May 2026 22:42:12 +0300 Subject: [PATCH 08/30] refactoring --- tests/opentelemetry/conftest.py | 21 ++- tests/opentelemetry/test_metrics.py | 202 +++++++++------------- tests/opentelemetry/test_tracing_async.py | 64 +++---- tests/opentelemetry/test_tracing_sync.py | 88 +++++----- ydb/aio/query/pool.py | 6 +- ydb/opentelemetry/metrics.py | 8 + ydb/query/pool.py | 6 +- 7 files changed, 185 insertions(+), 210 deletions(-) diff --git a/tests/opentelemetry/conftest.py b/tests/opentelemetry/conftest.py index 26c39cef1..854f8ccd3 100644 --- a/tests/opentelemetry/conftest.py +++ b/tests/opentelemetry/conftest.py @@ -7,6 +7,8 @@ import pytest from opentelemetry import trace +from opentelemetry.sdk.metrics import MeterProvider +from opentelemetry.sdk.metrics.export import InMemoryMetricReader from opentelemetry.sdk.trace import TracerProvider from opentelemetry.sdk.trace.export import SimpleSpanProcessor from opentelemetry.sdk.trace.export.in_memory_span_exporter import InMemorySpanExporter @@ -18,7 +20,7 @@ @pytest.fixture() -def otel_setup(): 
+def tracing_setup(): """Enable SDK tracing, yield the exporter, then restore noop defaults. Each test gets a clean exporter (cleared before and after). @@ -37,6 +39,23 @@ def otel_setup(): _exporter.clear() +@pytest.fixture() +def metrics_setup(): + """Enable SDK metrics with an in-memory reader, then restore noop defaults.""" + from ydb.opentelemetry import disable_registry, enable_registry + + reader = InMemoryMetricReader() + provider = MeterProvider(metric_readers=[reader]) + + disable_registry() + enable_registry(provider) + try: + yield reader + finally: + disable_registry() + provider.shutdown() + + class FakeDriverConfig: def __init__(self, endpoint="test_endpoint:1337", database="/test_database"): self.endpoint = endpoint diff --git a/tests/opentelemetry/test_metrics.py b/tests/opentelemetry/test_metrics.py index e0a57e948..0605e98d6 100644 --- a/tests/opentelemetry/test_metrics.py +++ b/tests/opentelemetry/test_metrics.py @@ -1,31 +1,6 @@ from unittest.mock import MagicMock import pytest -from opentelemetry.sdk.metrics import MeterProvider -from opentelemetry.sdk.metrics.export import InMemoryMetricReader - - -@pytest.fixture() -def metrics_reader(): - from ydb.opentelemetry.metrics import _metrics_registry - from ydb.opentelemetry.plugin import _create_query_session_count_callback - - reader = InMemoryMetricReader() - provider = MeterProvider(metric_readers=[reader]) - meter = provider.get_meter("ydb.sdk") - - from ydb.opentelemetry.plugin import _create_query_session_max_callback - - _metrics_registry.set_meter( - meter, - _create_query_session_count_callback(), - _create_query_session_max_callback(), - ) - try: - yield reader - finally: - _metrics_registry.clear() - provider.shutdown() def _metrics_by_name(reader): @@ -56,7 +31,7 @@ def _sum_value(reader, name): return _single_point(reader, name).value -def test_metrics_registry_records_all_instruments(metrics_reader, monkeypatch): +def test_metrics_registry_records_all_instruments(metrics_setup, 
monkeypatch): from ydb import issues from ydb.opentelemetry.metrics import ( CLIENT_OPERATION_DURATION, @@ -90,7 +65,7 @@ def test_metrics_registry_records_all_instruments(metrics_reader, monkeypatch): record_query_session_timeout("main") record_retry_metrics(0.75, 3) - metrics = _metrics_by_name(metrics_reader) + metrics = _metrics_by_name(metrics_setup) assert set(metrics) == { CLIENT_OPERATION_DURATION, @@ -135,7 +110,7 @@ def test_metrics_registry_is_noop_without_meter(): pass -def test_metrics_operation_records_duration_once(metrics_reader, monkeypatch): +def test_metrics_operation_records_duration_once(metrics_setup, monkeypatch): from ydb.opentelemetry.metrics import CLIENT_OPERATION_DURATION, create_metrics_operation monotonic = MagicMock(side_effect=[10.0, 10.25, 11.0]) @@ -152,7 +127,7 @@ def test_metrics_operation_records_duration_once(metrics_reader, monkeypatch): operation.end() operation.end() - point = _single_point(metrics_reader, CLIENT_OPERATION_DURATION) + point = _single_point(metrics_setup, CLIENT_OPERATION_DURATION) assert point.sum == 0.25 assert point.count == 1 @@ -165,7 +140,7 @@ def test_metrics_operation_records_duration_once(metrics_reader, monkeypatch): } -def test_metrics_operation_records_ydb_error(metrics_reader, monkeypatch): +def test_metrics_operation_records_ydb_error(metrics_setup, monkeypatch): from ydb import issues from ydb.opentelemetry.metrics import CLIENT_OPERATION_FAILED, create_metrics_operation @@ -175,34 +150,34 @@ def test_metrics_operation_records_ydb_error(metrics_reader, monkeypatch): with create_metrics_operation("ExecuteQuery"): raise issues.Unavailable("transient") - point = _single_point(metrics_reader, CLIENT_OPERATION_FAILED) + point = _single_point(metrics_setup, CLIENT_OPERATION_FAILED) assert point.value == 1 assert point.attributes["db.response.status_code"] == "UNAVAILABLE" assert point.attributes["ydb.operation.name"] == "ExecuteQuery" -def 
test_metrics_operation_records_generic_error_status_code(metrics_reader): +def test_metrics_operation_records_generic_error_status_code(metrics_setup): from ydb.opentelemetry.metrics import CLIENT_OPERATION_FAILED, create_metrics_operation with pytest.raises(ValueError): with create_metrics_operation("ExecuteQuery"): raise ValueError("bad value") - assert _single_point(metrics_reader, CLIENT_OPERATION_FAILED).attributes["db.response.status_code"] == "ValueError" + assert _single_point(metrics_setup, CLIENT_OPERATION_FAILED).attributes["db.response.status_code"] == "ValueError" -def test_metrics_operation_set_attribute(metrics_reader): +def test_metrics_operation_set_attribute(metrics_setup): from ydb.opentelemetry.metrics import CLIENT_OPERATION_DURATION, create_metrics_operation operation = create_metrics_operation("ExecuteQuery") operation.set_attribute("db.namespace", "/Root/test") operation.end() - assert _single_point(metrics_reader, CLIENT_OPERATION_DURATION).attributes["db.namespace"] == "/Root/test" + assert _single_point(metrics_setup, CLIENT_OPERATION_DURATION).attributes["db.namespace"] == "/Root/test" -def test_metrics_operation_ignores_non_metric_attributes(metrics_reader): +def test_metrics_operation_ignores_non_metric_attributes(metrics_setup): from ydb.opentelemetry.metrics import CLIENT_OPERATION_DURATION, create_metrics_operation operation = create_metrics_operation("ExecuteQuery") @@ -212,7 +187,7 @@ def test_metrics_operation_ignores_non_metric_attributes(metrics_reader): operation.set_attribute("ydb.node.id", 123) operation.end() - attrs = _single_point(metrics_reader, CLIENT_OPERATION_DURATION).attributes + attrs = _single_point(metrics_setup, CLIENT_OPERATION_DURATION).attributes assert "network.peer.address" not in attrs assert "network.peer.port" not in attrs @@ -220,83 +195,46 @@ def test_metrics_operation_ignores_non_metric_attributes(metrics_reader): assert "ydb.node.id" not in attrs -def 
test_metrics_operation_respects_end_on_exit_false(metrics_reader): +def test_metrics_operation_respects_end_on_exit_false(metrics_setup): from ydb.opentelemetry.metrics import CLIENT_OPERATION_DURATION, create_metrics_operation operation = create_metrics_operation("ExecuteQuery") with operation.attach_context(end_on_exit=False): pass - assert CLIENT_OPERATION_DURATION not in _metrics_by_name(metrics_reader) + assert CLIENT_OPERATION_DURATION not in _metrics_by_name(metrics_setup) operation.end() - point = _single_point(metrics_reader, CLIENT_OPERATION_DURATION) + point = _single_point(metrics_setup, CLIENT_OPERATION_DURATION) assert point.count == 1 assert point.sum >= 0 -def test_create_ydb_span_records_metrics_when_tracing_is_active(metrics_reader): +def test_create_ydb_span_records_metrics_when_tracing_is_active(metrics_setup, tracing_setup): from tests.opentelemetry.conftest import FakeDriverConfig from ydb.opentelemetry.metrics import CLIENT_OPERATION_DURATION - from ydb.opentelemetry.tracing import _registry, create_ydb_span + from ydb.opentelemetry.tracing import create_ydb_span + + exporter = tracing_setup + + with create_ydb_span( + "ydb.ExecuteQuery", + FakeDriverConfig(), + node_id=123, + peer=("node.example.net", 2136, "dc-a"), + ).attach_context(): + pass - class FakeSpan: - def __init__(self): - self.ended = False - self.attributes = {} - self._end_on_exit = True - - def set_error(self, exception): - pass - - def set_attribute(self, key, value): - self.attributes[key] = value - - def end(self): - self.ended = True - - def attach_context(self, end_on_exit=True): - self._end_on_exit = end_on_exit - return self - - def __enter__(self): - return self - - def __exit__(self, exc_type, exc_val, exc_tb): - if exc_val is not None or self._end_on_exit: - self.end() - return False - - created_spans = [] - - def create_span(name, attributes=None, kind=None): - span = FakeSpan() - span.attributes.update(attributes or {}) - created_spans.append(span) - return span 
- - try: - _registry.set_create_span(create_span) - - with create_ydb_span( - "ydb.ExecuteQuery", - FakeDriverConfig(), - node_id=123, - peer=("node.example.net", 2136, "dc-a"), - ).attach_context(): - pass - finally: - _registry.set_create_span(None) - - assert len(created_spans) == 1 - assert created_spans[0].ended is True - assert created_spans[0].attributes["network.peer.address"] == "node.example.net" - assert created_spans[0].attributes["network.peer.port"] == 2136 - assert created_spans[0].attributes["ydb.node.dc"] == "dc-a" - assert created_spans[0].attributes["ydb.node.id"] == 123 - - metric_attrs = _single_point(metrics_reader, CLIENT_OPERATION_DURATION).attributes + spans = exporter.get_finished_spans() + assert len(spans) == 1 + span_attrs = dict(spans[0].attributes) + assert span_attrs["network.peer.address"] == "node.example.net" + assert span_attrs["network.peer.port"] == 2136 + assert span_attrs["ydb.node.dc"] == "dc-a" + assert span_attrs["ydb.node.id"] == 123 + + metric_attrs = _single_point(metrics_setup, CLIENT_OPERATION_DURATION).attributes assert metric_attrs["ydb.operation.name"] == "ydb.ExecuteQuery" assert "network.peer.address" not in metric_attrs assert "network.peer.port" not in metric_attrs @@ -304,7 +242,7 @@ def create_span(name, attributes=None, kind=None): assert "ydb.node.id" not in metric_attrs -def test_create_ydb_span_records_metrics_when_tracing_is_disabled(metrics_reader): +def test_create_ydb_span_records_metrics_when_tracing_is_disabled(metrics_setup): from tests.opentelemetry.conftest import FakeDriverConfig from ydb.opentelemetry.metrics import CLIENT_OPERATION_DURATION from ydb.opentelemetry.tracing import _registry, create_ydb_span @@ -315,18 +253,18 @@ def test_create_ydb_span_records_metrics_when_tracing_is_disabled(metrics_reader pass assert ( - _single_point(metrics_reader, CLIENT_OPERATION_DURATION).attributes["ydb.operation.name"] == "ydb.ExecuteQuery" + _single_point(metrics_setup, 
CLIENT_OPERATION_DURATION).attributes["ydb.operation.name"] == "ydb.ExecuteQuery" ) -def test_query_session_count_accumulates_by_attributes(metrics_reader): +def test_query_session_count_accumulates_by_attributes(metrics_setup): from ydb.opentelemetry.metrics import QUERY_SESSION_COUNT, record_query_session_count record_query_session_count(1, "main", "used") record_query_session_count(2, "main", "used") record_query_session_count(1, None, "idle") - metric = _metrics_by_name(metrics_reader)[QUERY_SESSION_COUNT] + metric = _metrics_by_name(metrics_setup)[QUERY_SESSION_COUNT] values = {tuple(sorted(point.attributes.items())): point.value for point in metric.data.data_points} assert ( @@ -349,7 +287,7 @@ def test_query_session_count_accumulates_by_attributes(metrics_reader): ) -def test_query_session_helpers_record_pool_attributes(metrics_reader): +def test_query_session_helpers_record_pool_attributes(metrics_setup): from ydb.opentelemetry.metrics import ( QUERY_SESSION_CREATE_TIME, QUERY_SESSION_MAX, @@ -366,42 +304,56 @@ def test_query_session_helpers_record_pool_attributes(metrics_reader): record_query_session_pending_requests(1, None) record_query_session_timeout("main") - assert _histogram_sum(metrics_reader, QUERY_SESSION_CREATE_TIME) == 0.5 - assert _single_point(metrics_reader, QUERY_SESSION_CREATE_TIME).attributes == { - "ydb.query.session.pool.name": "main" - } - assert _sum_value(metrics_reader, QUERY_SESSION_PENDING_REQUESTS) == 1 - assert _single_point(metrics_reader, QUERY_SESSION_PENDING_REQUESTS).attributes == { + assert _histogram_sum(metrics_setup, QUERY_SESSION_CREATE_TIME) == 0.5 + assert _single_point(metrics_setup, QUERY_SESSION_CREATE_TIME).attributes == {"ydb.query.session.pool.name": "main"} + assert _sum_value(metrics_setup, QUERY_SESSION_PENDING_REQUESTS) == 1 + assert _single_point(metrics_setup, QUERY_SESSION_PENDING_REQUESTS).attributes == { "ydb.query.session.pool.name": "unknown" } - assert _sum_value(metrics_reader, 
QUERY_SESSION_TIMEOUTS) == 1 - assert _single_point(metrics_reader, QUERY_SESSION_TIMEOUTS).attributes == {"ydb.query.session.pool.name": "main"} - assert _single_point(metrics_reader, QUERY_SESSION_MAX).value == 100 - assert _single_point(metrics_reader, QUERY_SESSION_MAX).attributes == {"ydb.query.session.pool.name": "main"} + assert _sum_value(metrics_setup, QUERY_SESSION_TIMEOUTS) == 1 + assert _single_point(metrics_setup, QUERY_SESSION_TIMEOUTS).attributes == {"ydb.query.session.pool.name": "main"} + assert _single_point(metrics_setup, QUERY_SESSION_MAX).value == 100 + assert _single_point(metrics_setup, QUERY_SESSION_MAX).attributes == {"ydb.query.session.pool.name": "main"} -def test_sync_query_session_pool_records_max(metrics_reader): +def test_sync_query_session_pool_records_max(metrics_setup): from ydb.opentelemetry.metrics import QUERY_SESSION_MAX from ydb.query.pool import QuerySessionPool QuerySessionPool(driver=object(), size=42, name="sync-pool") - assert _single_point(metrics_reader, QUERY_SESSION_MAX).value == 42 - assert _single_point(metrics_reader, QUERY_SESSION_MAX).attributes == {"ydb.query.session.pool.name": "sync-pool"} + assert _single_point(metrics_setup, QUERY_SESSION_MAX).value == 42 + assert _single_point(metrics_setup, QUERY_SESSION_MAX).attributes == {"ydb.query.session.pool.name": "sync-pool"} @pytest.mark.asyncio -async def test_async_query_session_pool_records_max(metrics_reader): +async def test_async_query_session_pool_records_max(metrics_setup): from ydb.aio.query.pool import QuerySessionPool from ydb.opentelemetry.metrics import QUERY_SESSION_MAX QuerySessionPool(driver=object(), size=24, name="async-pool") - assert _single_point(metrics_reader, QUERY_SESSION_MAX).value == 24 - assert _single_point(metrics_reader, QUERY_SESSION_MAX).attributes == {"ydb.query.session.pool.name": "async-pool"} + assert _single_point(metrics_setup, QUERY_SESSION_MAX).value == 24 + assert _single_point(metrics_setup, QUERY_SESSION_MAX).attributes 
== {"ydb.query.session.pool.name": "async-pool"} + + +@pytest.mark.asyncio +async def test_sync_and_async_query_session_pool_auto_names_do_not_collide(metrics_setup): + from ydb.aio.query.pool import QuerySessionPool as AsyncQuerySessionPool + from ydb.opentelemetry.metrics import QUERY_SESSION_MAX + from ydb.query.pool import QuerySessionPool + + QuerySessionPool(driver=object(), size=42) + AsyncQuerySessionPool(driver=object(), size=24) + + metric = _metrics_by_name(metrics_setup)[QUERY_SESSION_MAX] + values = {point.attributes["ydb.query.session.pool.name"]: point.value for point in metric.data.data_points} + + assert len(values) == 2 + assert sorted(values.values()) == [24, 42] -def test_retry_operation_sync_records_retry_metrics(metrics_reader): +def test_retry_operation_sync_records_retry_metrics(metrics_setup): from ydb import issues from ydb.opentelemetry.metrics import RETRY_ATTEMPTS, RETRY_DURATION from ydb.retries import RetrySettings, retry_operation_sync @@ -416,11 +368,11 @@ def flaky(): assert retry_operation_sync(flaky, RetrySettings(max_retries=5)) == "ok" - duration = _single_point(metrics_reader, RETRY_DURATION) + duration = _single_point(metrics_setup, RETRY_DURATION) assert duration.count == 1 assert duration.sum >= 0 assert duration.attributes == {} - assert _histogram_sum(metrics_reader, RETRY_ATTEMPTS) == 3 + assert _histogram_sum(metrics_setup, RETRY_ATTEMPTS) == 3 async def _async_value(): @@ -428,14 +380,14 @@ async def _async_value(): @pytest.mark.asyncio -async def test_retry_operation_async_records_retry_metrics(metrics_reader): +async def test_retry_operation_async_records_retry_metrics(metrics_setup): from ydb.opentelemetry.metrics import RETRY_ATTEMPTS, RETRY_DURATION from ydb.retries import retry_operation_async assert await retry_operation_async(_async_value) == "ok" - duration = _single_point(metrics_reader, RETRY_DURATION) + duration = _single_point(metrics_setup, RETRY_DURATION) assert duration.count == 1 assert duration.sum >= 
0 assert duration.attributes == {} - assert _histogram_sum(metrics_reader, RETRY_ATTEMPTS) == 1 + assert _histogram_sum(metrics_setup, RETRY_ATTEMPTS) == 1 diff --git a/tests/opentelemetry/test_tracing_async.py b/tests/opentelemetry/test_tracing_async.py index 6b4e96ad1..af2b369b6 100644 --- a/tests/opentelemetry/test_tracing_async.py +++ b/tests/opentelemetry/test_tracing_async.py @@ -70,8 +70,8 @@ def _make_fresh_async_tx(session, driver): class TestAsyncCreateSessionSpan: @pytest.mark.asyncio - async def test_create_session_emits_span(self, otel_setup): - exporter = otel_setup + async def test_create_session_emits_span(self, tracing_setup): + exporter = tracing_setup from ydb.aio.query.session import QuerySession @@ -95,8 +95,8 @@ async def test_create_session_emits_span(self, otel_setup): assert attrs["server.address"] == "test_endpoint" assert attrs["server.port"] == 1337 - def test_async_connection_peer_attributes_are_resolved(self, otel_setup): - exporter = otel_setup + def test_async_connection_peer_attributes_are_resolved(self, tracing_setup): + exporter = tracing_setup from ydb.aio.connection import Connection from ydb.connection import EndpointOptions @@ -140,8 +140,8 @@ def test_async_connection_peer_attributes_are_resolved(self, otel_setup): class TestAsyncExecuteQuerySpan: @pytest.mark.asyncio - async def test_session_execute_emits_span(self, otel_setup): - exporter = otel_setup + async def test_session_execute_emits_span(self, tracing_setup): + exporter = tracing_setup from ydb.aio.query.session import QuerySession @@ -169,8 +169,8 @@ async def test_session_execute_emits_span(self, otel_setup): assert "ydb.session.id" not in attrs @pytest.mark.asyncio - async def test_tx_execute_emits_span(self, otel_setup): - exporter = otel_setup + async def test_tx_execute_emits_span(self, tracing_setup): + exporter = tracing_setup session, driver = _make_async_session_mock(peer=("n1", 2136, "dc-a")) tx = _make_async_tx(session, driver) @@ -192,8 +192,8 @@ async 
def test_tx_execute_emits_span(self, otel_setup): class TestAsyncBeginTransactionSpan: @pytest.mark.asyncio - async def test_begin_emits_span(self, otel_setup): - exporter = otel_setup + async def test_begin_emits_span(self, tracing_setup): + exporter = tracing_setup session, driver = _make_async_session_mock(peer=("n1", 2136, "dc-a")) tx = _make_fresh_async_tx(session, driver) @@ -214,10 +214,10 @@ async def test_begin_emits_span(self, otel_setup): assert span.status.status_code == StatusCode.UNSET @pytest.mark.asyncio - async def test_begin_sets_error_status_on_failure(self, otel_setup): + async def test_begin_sets_error_status_on_failure(self, tracing_setup): from ydb import issues - exporter = otel_setup + exporter = tracing_setup session, driver = _make_async_session_mock(peer=("n1", 2136, "dc-a")) tx = _make_fresh_async_tx(session, driver) @@ -236,8 +236,8 @@ async def test_begin_sets_error_status_on_failure(self, otel_setup): class TestAsyncCommitSpan: @pytest.mark.asyncio - async def test_commit_emits_span(self, otel_setup): - exporter = otel_setup + async def test_commit_emits_span(self, tracing_setup): + exporter = tracing_setup session, driver = _make_async_session_mock(peer=("n1", 2136, "dc-a")) tx = _make_async_tx(session, driver) @@ -255,8 +255,8 @@ async def test_commit_emits_span(self, otel_setup): class TestAsyncRollbackSpan: @pytest.mark.asyncio - async def test_rollback_emits_span(self, otel_setup): - exporter = otel_setup + async def test_rollback_emits_span(self, tracing_setup): + exporter = tracing_setup session, driver = _make_async_session_mock(peer=("n1", 2136, "dc-a")) tx = _make_async_tx(session, driver) @@ -279,10 +279,10 @@ class TestAsyncCommitRollbackErrorRecording: """ @pytest.mark.asyncio - async def test_commit_records_exception_on_failure(self, otel_setup): + async def test_commit_records_exception_on_failure(self, tracing_setup): from ydb import issues - exporter = otel_setup + exporter = tracing_setup session, driver = 
_make_async_session_mock(peer=("n1", 2136, "dc-a")) tx = _make_async_tx(session, driver) @@ -299,10 +299,10 @@ async def test_commit_records_exception_on_failure(self, otel_setup): assert any(e.name == "exception" for e in span.events) @pytest.mark.asyncio - async def test_rollback_records_exception_on_failure(self, otel_setup): + async def test_rollback_records_exception_on_failure(self, tracing_setup): from ydb import issues - exporter = otel_setup + exporter = tracing_setup session, driver = _make_async_session_mock(peer=("n1", 2136, "dc-a")) tx = _make_async_tx(session, driver) @@ -321,8 +321,8 @@ async def test_rollback_records_exception_on_failure(self, otel_setup): class TestAsyncErrorHandling: @pytest.mark.asyncio - async def test_error_sets_error_status_and_attributes(self, otel_setup): - exporter = otel_setup + async def test_error_sets_error_status_and_attributes(self, tracing_setup): + exporter = tracing_setup from ydb import issues @@ -353,10 +353,10 @@ async def test_error_sets_error_status_and_attributes(self, otel_setup): class TestAsyncRetryPolicySpans: @pytest.mark.asyncio - async def test_success_emits_single_try(self, otel_setup): + async def test_success_emits_single_try(self, tracing_setup): from ydb.retries import retry_operation_async - exporter = otel_setup + exporter = tracing_setup async def callee(): return 7 @@ -373,12 +373,12 @@ async def callee(): assert tries[0].status.status_code == StatusCode.UNSET @pytest.mark.asyncio - async def test_retry_failed_tries_set_error_status(self, otel_setup): + async def test_retry_failed_tries_set_error_status(self, tracing_setup): """Failed async attempts must set ``ydb.Try`` status to ERROR (not UNSET).""" from ydb import issues from ydb.retries import BackoffSettings, RetrySettings, retry_operation_async - exporter = otel_setup + exporter = tracing_setup counter = {"n": 0} async def flaky(): @@ -402,14 +402,14 @@ async def flaky(): assert tries[2].status.status_code == StatusCode.UNSET 
@pytest.mark.asyncio - async def test_context_cancel_during_backoff_records_exception(self, otel_setup): + async def test_context_cancel_during_backoff_records_exception(self, tracing_setup): """Inter-attempt sleep is outside ``ydb.Try``; cancellation during ``asyncio.sleep`` is recorded on ``ydb.RunWithRetry`` (``record_exception``). """ from ydb import issues from ydb.retries import BackoffSettings, RetrySettings, retry_operation_async - exporter = otel_setup + exporter = tracing_setup calls = {"n": 0} async def flaky(): @@ -442,7 +442,7 @@ async def flaky(): class TestAsyncRetrySpanNesting: @pytest.mark.asyncio - async def test_execute_query_is_child_of_try_under_run_with_retry(self, otel_setup): + async def test_execute_query_is_child_of_try_under_run_with_retry(self, tracing_setup): """``ydb.RunWithRetry`` -> ``ydb.Try`` -> ``ydb.ExecuteQuery`` (deep nesting). The previous implementation produced sibling spans because ``ydb.Try`` was @@ -452,7 +452,7 @@ async def test_execute_query_is_child_of_try_under_run_with_retry(self, otel_set from ydb.aio.query.session import QuerySession from ydb.retries import retry_operation_async - exporter = otel_setup + exporter = tracing_setup qs = QuerySession.__new__(QuerySession) cfg = FakeDriverConfig() @@ -485,9 +485,9 @@ async def callee(): class TestAsyncConcurrentSpansIsolation: @pytest.mark.asyncio - async def test_parallel_executes_do_not_become_parent_child(self, otel_setup): + async def test_parallel_executes_do_not_become_parent_child(self, tracing_setup): """Two concurrent execute calls must produce sibling spans, not parent-child.""" - exporter = otel_setup + exporter = tracing_setup from ydb.aio.query.session import QuerySession diff --git a/tests/opentelemetry/test_tracing_sync.py b/tests/opentelemetry/test_tracing_sync.py index 38e5b8ce0..1e9af1c78 100644 --- a/tests/opentelemetry/test_tracing_sync.py +++ b/tests/opentelemetry/test_tracing_sync.py @@ -67,8 +67,8 @@ def _make_fresh_tx(session, driver): class 
TestCreateSessionSpan: - def test_create_session_emits_span(self, otel_setup): - exporter = otel_setup + def test_create_session_emits_span(self, tracing_setup): + exporter = tracing_setup from ydb.query.session import QuerySession @@ -95,8 +95,8 @@ def test_create_session_emits_span(self, otel_setup): class TestExecuteQuerySpan: - def test_session_execute_emits_span(self, otel_setup): - exporter = otel_setup + def test_session_execute_emits_span(self, tracing_setup): + exporter = tracing_setup from ydb.query.session import QuerySession @@ -130,8 +130,8 @@ def test_session_execute_emits_span(self, otel_setup): assert "ydb.session.id" not in attrs assert "ydb.tx.id" not in attrs - def test_tx_execute_emits_span(self, otel_setup): - exporter = otel_setup + def test_tx_execute_emits_span(self, tracing_setup): + exporter = tracing_setup session, driver = _make_session_mock(peer=("n1", 2136, "dc-a")) tx = _make_tx(session, driver) @@ -152,8 +152,8 @@ def test_tx_execute_emits_span(self, otel_setup): class TestBeginTransactionSpan: - def test_begin_emits_span(self, otel_setup): - exporter = otel_setup + def test_begin_emits_span(self, tracing_setup): + exporter = tracing_setup session, driver = _make_session_mock(peer=("n1", 2136, "dc-a")) tx = _make_fresh_tx(session, driver) @@ -173,10 +173,10 @@ def test_begin_emits_span(self, otel_setup): assert "ydb.tx.id" not in attrs assert span.status.status_code == StatusCode.UNSET - def test_begin_sets_error_status_on_failure(self, otel_setup): + def test_begin_sets_error_status_on_failure(self, tracing_setup): from ydb import issues - exporter = otel_setup + exporter = tracing_setup session, driver = _make_session_mock(peer=("n1", 2136, "dc-a")) tx = _make_fresh_tx(session, driver) @@ -194,8 +194,8 @@ def test_begin_sets_error_status_on_failure(self, otel_setup): class TestCommitSpan: - def test_commit_emits_span(self, otel_setup): - exporter = otel_setup + def test_commit_emits_span(self, tracing_setup): + exporter = 
tracing_setup session, driver = _make_session_mock(peer=("n1", 2136, "dc-a")) tx = _make_tx(session, driver) @@ -214,8 +214,8 @@ def test_commit_emits_span(self, otel_setup): class TestRollbackSpan: - def test_rollback_emits_span(self, otel_setup): - exporter = otel_setup + def test_rollback_emits_span(self, tracing_setup): + exporter = tracing_setup session, driver = _make_session_mock(peer=("n1", 2136, "dc-a")) tx = _make_tx(session, driver) @@ -240,10 +240,10 @@ class TestCommitRollbackErrorRecording: - have the exception recorded as a span event (``record_exception``) """ - def test_commit_records_exception_on_failure(self, otel_setup): + def test_commit_records_exception_on_failure(self, tracing_setup): from ydb import issues - exporter = otel_setup + exporter = tracing_setup session, driver = _make_session_mock(peer=("n1", 2136, "dc-a")) tx = _make_tx(session, driver) @@ -259,10 +259,10 @@ def test_commit_records_exception_on_failure(self, otel_setup): assert attrs["db.response.status_code"] == "ABORTED" assert any(e.name == "exception" for e in span.events) - def test_rollback_records_exception_on_failure(self, otel_setup): + def test_rollback_records_exception_on_failure(self, tracing_setup): from ydb import issues - exporter = otel_setup + exporter = tracing_setup session, driver = _make_session_mock(peer=("n1", 2136, "dc-a")) tx = _make_tx(session, driver) @@ -280,8 +280,8 @@ def test_rollback_records_exception_on_failure(self, otel_setup): class TestErrorHandling: - def test_error_sets_error_status_and_attributes(self, otel_setup): - exporter = otel_setup + def test_error_sets_error_status_and_attributes(self, tracing_setup): + exporter = tracing_setup from ydb import issues @@ -327,8 +327,8 @@ def test_no_spans_without_enable_tracing(self): class TestParentChildRelationship: - def test_sdk_span_is_child_of_user_span(self, otel_setup): - exporter = otel_setup + def test_sdk_span_is_child_of_user_span(self, tracing_setup): + exporter = tracing_setup 
tracer = trace.get_tracer("test.tracer") @@ -346,7 +346,7 @@ def test_sdk_span_is_child_of_user_span(self, otel_setup): class TestTraceMetadataInjection: - def test_get_trace_metadata_returns_traceparent(self, otel_setup): + def test_get_trace_metadata_returns_traceparent(self, tracing_setup): from ydb.opentelemetry.tracing import get_trace_metadata tracer = trace.get_tracer("test.tracer") @@ -359,8 +359,8 @@ def test_get_trace_metadata_returns_traceparent(self, otel_setup): class TestDriverInitializeSpan: - def test_driver_initialize_emits_internal_span(self, otel_setup): - exporter = otel_setup + def test_driver_initialize_emits_internal_span(self, tracing_setup): + exporter = tracing_setup cfg = FakeDriverConfig() @@ -383,8 +383,8 @@ class TestCommonAttributes: ("[::1]:2136", "[::1]", 2136), ], ) - def test_endpoint_parsing(self, otel_setup, endpoint, expected_host, expected_port): - exporter = otel_setup + def test_endpoint_parsing(self, tracing_setup, endpoint, expected_host, expected_port): + exporter = tracing_setup cfg = FakeDriverConfig(endpoint=endpoint, database="/mydb") with create_ydb_span("ydb.Test", cfg).attach_context(): @@ -396,8 +396,8 @@ def test_endpoint_parsing(self, otel_setup, endpoint, expected_host, expected_po assert attrs["server.port"] == expected_port assert attrs["db.namespace"] == "/mydb" - def test_peer_attributes_are_optional(self, otel_setup): - exporter = otel_setup + def test_peer_attributes_are_optional(self, tracing_setup): + exporter = tracing_setup cfg = FakeDriverConfig() with create_ydb_span("ydb.Test", cfg).attach_context(): @@ -408,8 +408,8 @@ def test_peer_attributes_are_optional(self, otel_setup): assert "network.peer.address" not in attrs assert "network.peer.port" not in attrs - def test_peer_attributes_emitted_when_known(self, otel_setup): - exporter = otel_setup + def test_peer_attributes_emitted_when_known(self, tracing_setup): + exporter = tracing_setup cfg = FakeDriverConfig() with create_ydb_span("ydb.Test", 
cfg, peer=("peer.example.com", 2137, "dc-west")).attach_context(): @@ -423,7 +423,7 @@ def test_peer_attributes_emitted_when_known(self, otel_setup): class TestPeerFromEndpointMap: - def test_wrapper_create_session_pulls_peer_from_store(self, otel_setup): + def test_wrapper_create_session_pulls_peer_from_store(self, tracing_setup): """wrapper_create_session must resolve peer (host, port, dc) via the driver's connections_by_node_id cache, not via the grpc target string of the rpc call. """ @@ -454,10 +454,10 @@ def test_wrapper_create_session_pulls_peer_from_store(self, otel_setup): class TestRetryPolicySpans: - def test_success_on_first_try_emits_single_try(self, otel_setup): + def test_success_on_first_try_emits_single_try(self, tracing_setup): from ydb.retries import retry_operation_sync - exporter = otel_setup + exporter = tracing_setup def callee(): return 42 @@ -474,12 +474,12 @@ def callee(): assert "ydb.retry.backoff_ms" not in dict(tries[0].attributes) assert tries[0].parent.span_id == run.context.span_id - def test_retry_backoff_ms_on_each_try(self, otel_setup): + def test_retry_backoff_ms_on_each_try(self, tracing_setup): from ydb import issues from ydb.retries import retry_operation_sync from ydb.retries import RetrySettings, BackoffSettings - exporter = otel_setup + exporter = tracing_setup counter = {"n": 0} def flaky(): @@ -509,7 +509,7 @@ def flaky(): assert tries[1].status.status_code == StatusCode.ERROR assert tries[2].status.status_code == StatusCode.UNSET - def test_backoff_ms_attribute_matches_actual_sleep(self, otel_setup, monkeypatch): + def test_backoff_ms_attribute_matches_actual_sleep(self, tracing_setup, monkeypatch): """Pin the closure: ``ydb.retry.backoff_ms`` on the n-th ``ydb.Try`` equals the sleep that preceded it, regardless of which retry attempt triggered it. 
@@ -528,7 +528,7 @@ def test_backoff_ms_attribute_matches_actual_sleep(self, otel_setup, monkeypatch sleeps = [] monkeypatch.setattr("time.sleep", sleeps.append) - exporter = otel_setup + exporter = tracing_setup counter = {"n": 0} def flaky(): @@ -553,12 +553,12 @@ def flaky(): assert dict(tries[2].attributes)["ydb.retry.backoff_ms"] == expected_ms assert sleeps == [expected_ms / 1000.0, expected_ms / 1000.0] - def test_skip_backoff_errors_still_emit_one_try_per_attempt(self, otel_setup): + def test_skip_backoff_errors_still_emit_one_try_per_attempt(self, tracing_setup): """Aborted/BadSession path skips the inter-attempt sleep but must still rotate ydb.Try spans.""" from ydb import issues from ydb.retries import RetrySettings, retry_operation_sync - exporter = otel_setup + exporter = tracing_setup counter = {"n": 0} def flaky(): @@ -581,11 +581,11 @@ def flaky(): assert dict(tries[1].attributes)["ydb.retry.backoff_ms"] == 0 assert dict(tries[2].attributes)["ydb.retry.backoff_ms"] == 0 - def test_non_retryable_error_propagates_to_run_span(self, otel_setup): + def test_non_retryable_error_propagates_to_run_span(self, tracing_setup): from ydb import issues from ydb.retries import retry_operation_sync - exporter = otel_setup + exporter = tracing_setup def broken(): raise issues.SchemeError("boom") @@ -603,12 +603,12 @@ def broken(): assert attrs["error.type"] == "ydb_error" assert attrs["db.response.status_code"] == "SCHEME_ERROR" - def test_execute_query_is_child_of_try_under_run_with_retry(self, otel_setup): + def test_execute_query_is_child_of_try_under_run_with_retry(self, tracing_setup): """``ydb.RunWithRetry`` -> ``ydb.Try`` -> ``ydb.ExecuteQuery`` (sync path).""" from ydb.query.session import QuerySession from ydb.retries import retry_operation_sync - exporter = otel_setup + exporter = tracing_setup qs = QuerySession.__new__(QuerySession) cfg = FakeDriverConfig() diff --git a/ydb/aio/query/pool.py b/ydb/aio/query/pool.py index f0ffb9e63..547b458e0 100644 --- 
a/ydb/aio/query/pool.py +++ b/ydb/aio/query/pool.py @@ -2,7 +2,6 @@ import asyncio import logging -import itertools import time from typing import ( Callable, @@ -25,6 +24,7 @@ from ... import convert from ... import issues from ...opentelemetry.metrics import ( + next_query_session_pool_name, record_query_session_count, record_query_session_create_time, record_query_session_max, @@ -36,8 +36,6 @@ logger = logging.getLogger(__name__) -_pool_name_counter = itertools.count(1) - class QuerySessionPool: """QuerySessionPool is an object to simplify operations with sessions of Query Service.""" @@ -65,7 +63,7 @@ def __init__( self._current_size = 0 self._loop = asyncio.get_running_loop() if loop is None else loop self._query_client_settings = query_client_settings - self._metrics_pool_name = name or "query-session-pool-%d" % next(_pool_name_counter) + self._metrics_pool_name = name or next_query_session_pool_name() record_query_session_max(self._size, self._metrics_pool_name) async def _create_new_session(self): diff --git a/ydb/opentelemetry/metrics.py b/ydb/opentelemetry/metrics.py index 9d647942a..2c610dc4f 100644 --- a/ydb/opentelemetry/metrics.py +++ b/ydb/opentelemetry/metrics.py @@ -2,6 +2,7 @@ import time import threading +import itertools from typing import Any, Dict, Optional CLIENT_OPERATION_DURATION = "db.client.operation.duration" @@ -15,6 +16,8 @@ RETRY_DURATION = "ydb.client.retry.duration" _UNKNOWN_POOL = "unknown" +_pool_name_counter = itertools.count(1) +_pool_name_lock = threading.Lock() _OPERATION_ATTR_KEYS = frozenset( { "db.system.name", @@ -161,6 +164,11 @@ def _pool_attrs(pool_name: Optional[str]) -> Dict[str, Any]: return {"ydb.query.session.pool.name": pool_name or _UNKNOWN_POOL} +def next_query_session_pool_name() -> str: + with _pool_name_lock: + return "query-session-pool-%d" % next(_pool_name_counter) + + def _operation_attrs(operation_name: str, attributes: Dict[str, Any]) -> Dict[str, Any]: return { "db.system.name": 
attributes.get("db.system.name", "ydb"), diff --git a/ydb/query/pool.py b/ydb/query/pool.py index 8445769ef..cfa45aded 100644 --- a/ydb/query/pool.py +++ b/ydb/query/pool.py @@ -14,7 +14,6 @@ import time import threading import queue -import itertools from .base import BaseQueryTxMode, QueryExplainResultFormat from .base import QueryClientSettings @@ -29,6 +28,7 @@ from .. import convert from ..settings import BaseRequestSettings from ..opentelemetry.metrics import ( + next_query_session_pool_name, record_query_session_count, record_query_session_create_time, record_query_session_max, @@ -42,8 +42,6 @@ logger = logging.getLogger(__name__) -_pool_name_counter = itertools.count(1) - class QuerySessionPool: """QuerySessionPool is an object to simplify operations with sessions of Query Service.""" @@ -75,7 +73,7 @@ def __init__( self._should_stop = threading.Event() self._lock = threading.RLock() self._query_client_settings = query_client_settings - self._metrics_pool_name = name or "query-session-pool-%d" % next(_pool_name_counter) + self._metrics_pool_name = name or next_query_session_pool_name() record_query_session_max(self._size, self._metrics_pool_name) def _create_new_session(self, timeout: Optional[float]): From 89954635a5c047871774291e55eda6a0e4ab8aca Mon Sep 17 00:00:00 2001 From: tewbo Date: Sat, 16 May 2026 01:34:04 +0300 Subject: [PATCH 09/30] fix docker example --- examples/opentelemetry/Dockerfile | 2 ++ ydb/opentelemetry/metrics.py | 4 ++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/examples/opentelemetry/Dockerfile b/examples/opentelemetry/Dockerfile index 326721a1c..17f646be9 100644 --- a/examples/opentelemetry/Dockerfile +++ b/examples/opentelemetry/Dockerfile @@ -6,6 +6,8 @@ FROM python:3.11-slim +ENV PYTHONUNBUFFERED=1 + WORKDIR /app # Dependency layer: copy only what setup.py needs so changes to the demo script do diff --git a/ydb/opentelemetry/metrics.py b/ydb/opentelemetry/metrics.py index 2c610dc4f..de329477f 100644 --- 
a/ydb/opentelemetry/metrics.py +++ b/ydb/opentelemetry/metrics.py @@ -229,7 +229,7 @@ def set_attribute(self, key: str, value: Any) -> None: if key in _OPERATION_ATTR_KEYS: self._attributes[key] = value - def attach_context(self, end_on_exit=True) -> _MetricsOperationContext: + def attach_context(self, end_on_exit=True) -> "_MetricsOperationContext": return _MetricsOperationContext(self, end_on_exit) def end(self) -> None: @@ -261,7 +261,7 @@ def __init__(self, operation: MetricsOperation, end_on_exit: bool) -> None: self._operation = operation self._end_on_exit = end_on_exit - def __enter__(self) -> MetricsOperation: + def __enter__(self) -> "MetricsOperation": return self._operation def __exit__(self, exc_type, exc_val, exc_tb) -> bool: From da3f3139e328d61c54564879357b986f5101eada Mon Sep 17 00:00:00 2001 From: tewbo Date: Sat, 16 May 2026 02:22:33 +0300 Subject: [PATCH 10/30] add docs --- docs/index.rst | 4 + docs/opentelemetry.rst | 174 ++++++++++++++++++++++--- examples/opentelemetry/README.md | 12 +- examples/opentelemetry/otel_example.py | 4 +- tests/opentelemetry/conftest.py | 8 +- ydb/opentelemetry/__init__.py | 21 ++- ydb/opentelemetry/metrics.py | 71 ++++------ ydb/opentelemetry/plugin.py | 9 +- ydb/opentelemetry/tracing.py | 6 +- 9 files changed, 221 insertions(+), 88 deletions(-) diff --git a/docs/index.rst b/docs/index.rst index cbe2c5ddb..e64ef885d 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -117,6 +117,10 @@ application using OpenTelemetry. One call to ``enable_tracing()`` instruments query sessions, transactions, and connection pool operations — so you can visualize request flow in Jaeger, Grafana, or any OpenTelemetry-compatible backend. +The same page also covers client-side metrics. ``enable_metrics()`` exposes operation +latency, retry cost, and query session pool metrics through an OpenTelemetry +``MeterProvider``. 
+ API Reference ------------- diff --git a/docs/opentelemetry.rst b/docs/opentelemetry.rst index c4eb810e8..fa2ce4834 100644 --- a/docs/opentelemetry.rst +++ b/docs/opentelemetry.rst @@ -1,14 +1,19 @@ -OpenTelemetry Tracing -===================== - -The SDK provides built-in distributed tracing via `OpenTelemetry `_. -When enabled, key YDB operations — such as session creation, query execution, transaction -commit/rollback, and driver initialization — produce OpenTelemetry spans. Trace -context is automatically propagated to the YDB server through gRPC metadata using the +OpenTelemetry +============= + +The SDK provides built-in distributed tracing and client-side metrics via +`OpenTelemetry `_. When tracing is enabled, key YDB +operations — such as session creation, query execution, transaction commit/rollback, +and driver initialization — produce OpenTelemetry spans. Trace context is automatically +propagated to the YDB server through gRPC metadata using the `W3C Trace Context `_ standard. -Tracing is **zero-cost when disabled**: the SDK uses no-op stubs by default, so there is -no overhead unless you explicitly opt in. +Metrics expose operation latency/failures, retry cost, and query session pool state. +Tracing and metrics are configured independently: enabling one does not require enabling +the other. + +Instrumentation is **zero-cost when disabled**: the SDK uses no-op stubs by default, so +there is no OpenTelemetry overhead unless you explicitly opt in. Installation @@ -22,7 +27,7 @@ OpenTelemetry packages are not included by default. Install the SDK with the pip install ydb[opentelemetry] This pulls in ``opentelemetry-api``. You will also need ``opentelemetry-sdk`` and an -exporter for your tracing backend, for example: +exporter for your tracing or metrics backend, for example: .. 
code-block:: sh @@ -73,6 +78,52 @@ Repeated calls to ``enable_tracing()`` do nothing until you call ``disable_traci which removes hooks so you can reconfigure or turn instrumentation off. +Enabling Metrics +---------------- + +Call ``enable_metrics()`` once, after configuring your OpenTelemetry meter provider +and before creating YDB drivers or query session pools: + +.. code-block:: python + + from opentelemetry.exporter.otlp.proto.grpc.metric_exporter import OTLPMetricExporter + from opentelemetry.sdk.metrics import MeterProvider + from opentelemetry.sdk.metrics.export import PeriodicExportingMetricReader + from opentelemetry.sdk.resources import Resource + + import ydb + from ydb.opentelemetry import enable_metrics + + # 1. Set up OpenTelemetry + resource = Resource(attributes={"service.name": "my-service"}) + metric_reader = PeriodicExportingMetricReader( + OTLPMetricExporter(endpoint="http://localhost:4317"), + export_interval_millis=1000, + ) + meter_provider = MeterProvider(resource=resource, metric_readers=[metric_reader]) + + # 2. Enable YDB metrics + enable_metrics(meter_provider) + + # 3. Use the SDK as usual — metrics are recorded automatically + with ydb.Driver(endpoint="grpc://localhost:2136", database="/local") as driver: + driver.wait(timeout=5) + with ydb.QuerySessionPool(driver, name="main-pool") as pool: + pool.execute_with_retries("SELECT 1") + + meter_provider.shutdown() + +``enable_metrics()`` accepts an optional ``meter_provider`` argument. If omitted, the +SDK obtains a meter named ``"ydb.sdk"`` from the global meter provider. + +Repeated calls to ``enable_metrics()`` do nothing until you call +``disable_metrics()``, which clears the in-memory observable metric values and allows +metrics to be reconfigured. + +Metrics are independent from tracing. If both ``enable_tracing()`` and +``enable_metrics()`` are called, YDB client operations produce both spans and metrics. 
+ + What Is Instrumented -------------------- @@ -171,6 +222,85 @@ On errors, the span also records: - ``db.response.status_code`` — the YDB status code name (e.g. ``"SCHEME_ERROR"``). +Metric Instruments +------------------ + +The SDK creates the following instruments with meter name ``"ydb.sdk"``: + +.. list-table:: + :header-rows: 1 + :widths: 30 15 15 40 + + * - Metric + - Instrument + - Unit + - Description + * - ``db.client.operation.duration`` + - Histogram + - ``s`` + - Latency of user-visible YDB client operations. + * - ``ydb.client.operation.failed`` + - Counter + - ``{command}`` + - Failed user-visible YDB client operations. + * - ``ydb.query.session.create_time`` + - Histogram + - ``s`` + - Time spent creating a query session. + * - ``ydb.query.session.pending_requests`` + - UpDownCounter + - ``{request}`` + - Requests currently waiting for a session from the pool. + * - ``ydb.query.session.timeouts`` + - Counter + - ``{connection}`` + - Session acquisition timeouts. + * - ``ydb.query.session.count`` + - ObservableUpDownCounter + - ``{connection}`` + - Current number of open query sessions by pool and state. + * - ``ydb.query.session.max`` + - ObservableUpDownCounter + - ``{connection}`` + - Maximum configured number of sessions for a query session pool. + * - ``ydb.client.retry.duration`` + - Histogram + - ``s`` + - Total user-visible duration of a logical retried operation, including attempts and backoff. + * - ``ydb.client.retry.attempts`` + - Histogram + - ``{attempt}`` + - Number of attempts performed for one logical retried operation. + +Operation metrics use stable labels only: + +.. list-table:: + :header-rows: 1 + :widths: 35 65 + + * - Attribute + - Description + * - ``db.system.name`` + - Always ``"ydb"``. + * - ``db.namespace`` + - Database path. + * - ``server.address`` + - Host from the configured endpoint. + * - ``server.port`` + - Port from the configured endpoint. 
+ * - ``ydb.operation.name`` + - SDK operation name, for example ``"ydb.ExecuteQuery"``. + * - ``db.response.status_code`` + - Added only to ``ydb.client.operation.failed``. + +Query session metrics use ``ydb.query.session.pool.name``. The pool name is generated +automatically, or can be set explicitly with ``QuerySessionPool(..., name="main-pool")`` +for both synchronous and asynchronous pools. ``ydb.query.session.count`` also includes +``ydb.query.session.state`` with values ``"idle"`` or ``"used"``. + +Retry metrics are recorded without attributes. + + Trace Context Propagation ------------------------- @@ -189,16 +319,17 @@ request path. Async Usage ----------- -Tracing works identically with the async driver. Call ``enable_tracing()`` once at -startup: +Tracing and metrics work identically with the async driver. Call +``enable_tracing()`` and/or ``enable_metrics()`` once at startup: .. code-block:: python import asyncio import ydb - from ydb.opentelemetry import enable_tracing + from ydb.opentelemetry import enable_metrics, enable_tracing enable_tracing() + enable_metrics() async def main(): async with ydb.aio.Driver( @@ -206,7 +337,7 @@ startup: database="/local", ) as driver: await driver.wait(timeout=5) - async with ydb.aio.QuerySessionPool(driver) as pool: + async with ydb.aio.QuerySessionPool(driver, name="async-main-pool") as pool: await pool.execute_with_retries("SELECT 1") asyncio.run(main()) @@ -229,12 +360,14 @@ To use a specific tracer instead of the global one: Running the Examples -------------------- -The runnable script is ``examples/opentelemetry/otel_example.py`` (bank table + concurrent -Serializable transactions and ``app_startup`` / ``example_tli`` application spans). **Start -Docker (YDB or the full stack) first**, then install and run on the host — see -``examples/opentelemetry/README.md`` for the full order of commands and environment variables. +The runnable script is ``examples/opentelemetry/otel_example.py``. 
It demonstrates both +tracing and metrics: bank table + concurrent Serializable transactions, +``app_startup`` / ``example_tli`` application spans, and SDK metrics exported through +OTLP. **Start Docker (YDB or the full stack) first**, then install and run on the host +— see ``examples/opentelemetry/README.md`` for the full order of commands and +environment variables. -**Full stack in one command** (YDB + OTLP + Tempo + Grafana; the ``otel-example`` service is built from ``examples/opentelemetry/Dockerfile`` and runs the script once): +**Full stack in one command** (YDB + OTLP + Tempo + Grafana + Prometheus; the ``otel-example`` service is built from ``examples/opentelemetry/Dockerfile`` and runs the script once): .. code-block:: sh @@ -250,4 +383,5 @@ The first run builds the ``otel-example`` image from the local SDK source; subse pip install -e '.[opentelemetry]' -r examples/opentelemetry/requirements.txt python examples/opentelemetry/otel_example.py -Open `http://localhost:3000 <http://localhost:3000>`_ (Grafana) to explore traces via Tempo. +Open `http://localhost:3000 <http://localhost:3000>`_ (Grafana) to explore traces via +Tempo and metrics through the configured Prometheus data source. diff --git a/examples/opentelemetry/README.md b/examples/opentelemetry/README.md index 1af90f6d3..d524df518 100644 --- a/examples/opentelemetry/README.md +++ b/examples/opentelemetry/README.md @@ -1,7 +1,8 @@ # OpenTelemetry example (YDB Python SDK) Async demo in [`otel_example.py`](otel_example.py): OTLP export, `enable_tracing()`, -`app_startup` and `example_tli` application spans, bank table, Serializable transactions (TLI-style load). +`enable_metrics()`, `app_startup` and `example_tli` application spans, SDK client +metrics, bank table, Serializable transactions (TLI-style load). Most steps assume the **repository root** as the current directory; the install step also shows the variant from this folder. 
@@ -17,7 +18,7 @@ docker compose up -d # wait until the ydb container is healthy / port 2136 is open, then continue ``` -**Full stack** (YDB + OTLP collector + Tempo + Grafana; the `otel-example` service is built from a `Dockerfile` and runs the script once inside Compose). The compose file is `compose-e2e.yaml` next to this README. +**Full stack** (YDB + OTLP collector + Tempo + Prometheus + Grafana; the `otel-example` service is built from a `Dockerfile` and runs the script once inside Compose). The compose file is `compose-e2e.yaml` next to this README. ```sh cd /path/to/ydb-python-sdk @@ -34,6 +35,11 @@ docker compose -f compose-e2e.yaml up --build The first run builds the `otel-example` image from the local SDK source (`Dockerfile` in this folder, `.dockerignore` at the repo root keeps the context small). Subsequent runs reuse the cached image; pass `--build` if you change the SDK or the demo script. Grafana: http://localhost:3000 +Prometheus: http://localhost:9090 + +Use Grafana to explore traces through Tempo and metrics through Prometheus. In +Prometheus, SDK metric names are exposed in Prometheus format; search by prefixes such +as `db_client_operation_duration` and `ydb_query_session_count`. **Logs for `otel-example`:** the container name is prefixed (e.g. `opentelemetry-otel-example-1`); use `docker compose -f examples/opentelemetry/compose-e2e.yaml ps` or `docker ps -a` to find it. The service is one-shot (`restart: "no"`) — it may already have exited. @@ -63,7 +69,7 @@ pip install -e '../..[opentelemetry]' -r requirements.txt python examples/opentelemetry/otel_example.py ``` -Defaults: YDB `grpc://localhost:2136`, OTLP `http://localhost:4317` (for a local collector, if you use one). +Defaults: YDB `grpc://localhost:2136`, OTLP `http://localhost:4317` (for a local collector, if you use one). The same OTLP endpoint receives both traces and metrics. 
## Environment (Docker / overrides) diff --git a/examples/opentelemetry/otel_example.py b/examples/opentelemetry/otel_example.py index db6f6089e..3bdd6a6b8 100644 --- a/examples/opentelemetry/otel_example.py +++ b/examples/opentelemetry/otel_example.py @@ -14,7 +14,7 @@ from opentelemetry.sdk.metrics._internal.export import PeriodicExportingMetricReader import ydb -from ydb.opentelemetry import enable_tracing, enable_registry, disable_registry +from ydb.opentelemetry import enable_metrics, enable_tracing from opentelemetry import trace from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter from opentelemetry.sdk.resources import Resource @@ -62,7 +62,7 @@ async def main() -> None: export_interval_millis=1000, ) meter_provider = MeterProvider(resource=resource, metric_readers=[metric_reader]) - enable_registry(meter_provider) + enable_metrics(meter_provider) async with ydb.aio.Driver( endpoint=endpoint, diff --git a/tests/opentelemetry/conftest.py b/tests/opentelemetry/conftest.py index 854f8ccd3..7518dd213 100644 --- a/tests/opentelemetry/conftest.py +++ b/tests/opentelemetry/conftest.py @@ -42,17 +42,17 @@ def tracing_setup(): @pytest.fixture() def metrics_setup(): """Enable SDK metrics with an in-memory reader, then restore noop defaults.""" - from ydb.opentelemetry import disable_registry, enable_registry + from ydb.opentelemetry import disable_metrics, enable_metrics reader = InMemoryMetricReader() provider = MeterProvider(metric_readers=[reader]) - disable_registry() - enable_registry(provider) + disable_metrics() + enable_metrics(provider) try: yield reader finally: - disable_registry() + disable_metrics() provider.shutdown() diff --git a/ydb/opentelemetry/__init__.py b/ydb/opentelemetry/__init__.py index e29a577d6..15c1e3cad 100644 --- a/ydb/opentelemetry/__init__.py +++ b/ydb/opentelemetry/__init__.py @@ -33,12 +33,16 @@ def disable_tracing(): _disable_tracing() -def enable_registry(meter_provider=None): +def 
enable_metrics(meter_provider=None): """Enable OpenTelemetry metrics collection for YDB SDK client metrics. + This call is **idempotent**: if metrics are already enabled, later calls do nothing + (including passing a different ``meter_provider``). Call :func:`disable_metrics` + first to reconfigure or turn instrumentation off. + Args: - meter_provider: Optional OpenTelemetry MeterProvider. If not provided, - the global OpenTelemetry meter provider is used. + meter_provider: Optional OTel meter provider to use. If not provided, the + default meter named ``ydb.sdk`` from the global meter provider will be used. """ try: from ydb.opentelemetry.plugin import _enable_metrics @@ -51,8 +55,8 @@ def enable_registry(meter_provider=None): _enable_metrics(meter_provider) -def disable_registry(): - """Disable YDB OpenTelemetry metrics collection and allow :func:`enable_registry` to run again.""" +def disable_metrics(): + """Disable YDB OpenTelemetry metrics collection and allow :func:`enable_metrics` to run again.""" try: from ydb.opentelemetry.plugin import _disable_metrics except ImportError: @@ -61,4 +65,9 @@ def disable_registry(): _disable_metrics() -__all__ = ["disable_tracing", "enable_tracing", "disable_registry", "enable_registry"] +__all__ = [ + "disable_tracing", + "enable_tracing", + "disable_metrics", + "enable_metrics", +] diff --git a/ydb/opentelemetry/metrics.py b/ydb/opentelemetry/metrics.py index de329477f..c23ee20c8 100644 --- a/ydb/opentelemetry/metrics.py +++ b/ydb/opentelemetry/metrics.py @@ -1,4 +1,9 @@ -"""No-op-safe helpers for YDB OpenTelemetry metrics.""" +"""No-op-safe helpers for YDB OpenTelemetry client metrics. + +The SDK records metrics only after :func:`ydb.opentelemetry.enable_metrics` +installs OpenTelemetry instruments. Until then every helper is a cheap no-op, +which keeps metrics independent from tracing and safe to call from hot paths. 
+""" import time import threading @@ -30,6 +35,13 @@ class MetricsRegistry: + """Process-wide metric instrument registry. + + Regular instruments are recorded immediately. Observable query-session + instruments keep their latest values in memory and expose snapshots through + callbacks registered by ``ydb.opentelemetry.plugin``. + """ + def __init__(self) -> None: self._instruments: Dict[str, Any] = {} self._query_session_count_values: Dict[Any, int] = {} @@ -105,30 +117,13 @@ def clear(self) -> None: self._query_session_max_values = {} def add(self, name: str, value: int, attributes: Optional[Dict[str, Any]] = None) -> None: - """ - Record a metric value, accumulating for observable metrics or adding directly for others. - - For observable metrics, values are accumulated by attributes and sent via callback. - For regular metrics, values are added immediately to the instrument. - - Args: - name: Name of the metric. - value: Value to add (positive or negative). - attributes: Optional dictionary of metric attributes (labels). - """ + """Add ``value`` to a counter-like instrument if metrics are enabled.""" instrument = self._instruments.get(name) if instrument is not None: instrument.add(value, attributes=attributes or {}) def record(self, name: str, value: float, attributes: Optional[Dict[str, Any]] = None) -> None: - """ - Record a histogram or gauge metric value. - - Args: - name: Name of the metric. - value: Value to record. - attributes: Optional dictionary of metric attributes (labels). 
- """ + """Record ``value`` in a histogram-like instrument if metrics are enabled.""" instrument = self._instruments.get(name) if instrument is not None: instrument.record(value, attributes=attributes or {}) @@ -165,6 +160,7 @@ def _pool_attrs(pool_name: Optional[str]) -> Dict[str, Any]: def next_query_session_pool_name() -> str: + """Return a process-unique default query session pool name for metric labels.""" with _pool_name_lock: return "query-session-pool-%d" % next(_pool_name_counter) @@ -187,28 +183,15 @@ def _response_status_code(exception: BaseException) -> str: class MetricsOperation: - """ - Context manager for tracking metrics of a single YDB operation. - - Records operation duration and captures errors. When the operation ends, - metrics are recorded to the registry with operation attributes. + """Metric lifecycle object for one user-visible YDB client operation. - Attributes: - _name: Name of the operation. - _attributes: Dictionary of attributes attached to all metrics from this operation. - _start_time: Timestamp when the operation started (using monotonic). - _exception: Optional exception that occurred during operation execution. - _ended: Flag to ensure metrics are recorded only once. + ``MetricsOperation`` mirrors the small span-like interface used by tracing + so both can be composed by ``create_ydb_span``. It records operation + duration once, records a failed-operation counter when an exception is + attached, and accepts only stable operation labels. """ def __init__(self, name: str, attributes: Optional[Dict[str, Any]] = None) -> None: - """ - Initialize a metrics operation. - - Args: - name: Name of the operation (e.g., 'ExecuteQuery', 'CreateSession'). - attributes: Optional dictionary of initial attributes for the operation. 
- """ self._name = name self._attributes = _operation_attrs(name, attributes or {}) self._start_time = time.monotonic() @@ -217,15 +200,11 @@ def __init__(self, name: str, attributes: Optional[Dict[str, Any]] = None) -> No self._end_lock = threading.Lock() def set_error(self, exception: BaseException) -> None: - """ - Record an exception that occurred during the operation. - - Args: - exception: The exception to record. - """ + """Remember the operation exception for the failed-operation metric.""" self._exception = exception def set_attribute(self, key: str, value: Any) -> None: + """Set a metric label only when it is part of the operation metric contract.""" if key in _OPERATION_ATTR_KEYS: self._attributes[key] = value @@ -257,11 +236,13 @@ def __exit__(self, exc_type, exc_val, exc_tb) -> bool: class _MetricsOperationContext: + """Context manager that optionally leaves ``end()`` to a streaming result iterator.""" + def __init__(self, operation: MetricsOperation, end_on_exit: bool) -> None: self._operation = operation self._end_on_exit = end_on_exit - def __enter__(self) -> "MetricsOperation": + def __enter__(self) -> MetricsOperation: return self._operation def __exit__(self, exc_type, exc_val, exc_tb) -> bool: diff --git a/ydb/opentelemetry/plugin.py b/ydb/opentelemetry/plugin.py index 0a6983135..7aa7ca510 100644 --- a/ydb/opentelemetry/plugin.py +++ b/ydb/opentelemetry/plugin.py @@ -2,16 +2,14 @@ from opentelemetry import context as otel_context from opentelemetry import trace +from opentelemetry import metrics as otel_metrics from opentelemetry.metrics import Observation -from opentelemetry import metrics from opentelemetry.propagate import inject from opentelemetry.trace import StatusCode from ydb import issues from ydb.issues import StatusCode as YdbStatusCode -from ydb.opentelemetry import metrics - -from ydb.opentelemetry.metrics import _metrics_registry, create_metrics_operation +from ydb.opentelemetry.metrics import _metrics_registry from 
ydb.opentelemetry.tracing import _registry as _tracing_registry # YDB client transport StatusCode values (401xxx band) -> OTel error.type transport_error. @@ -161,13 +159,14 @@ def _create_query_session_max_callback(): def _enable_metrics(meter_provider): + """Create SDK metric instruments from an OTel MeterProvider and enable recording.""" global _meter if _meter is not None: return if meter_provider is None: - _meter = metrics.get_meter("ydb.sdk") + _meter = otel_metrics.get_meter("ydb.sdk") elif hasattr(meter_provider, "get_meter"): _meter = meter_provider.get_meter("ydb.sdk") else: diff --git a/ydb/opentelemetry/tracing.py b/ydb/opentelemetry/tracing.py index d006b0523..d601fa204 100644 --- a/ydb/opentelemetry/tracing.py +++ b/ydb/opentelemetry/tracing.py @@ -1,4 +1,4 @@ -"""Internal SDK tracing helpers and registry.""" +"""Internal SDK tracing helpers and telemetry facade.""" import enum from typing import Optional, Tuple @@ -52,7 +52,7 @@ def attach_context(self, end_on_exit=True): class _TelemetryContext: - """Attach both tracing and metrics lifecycle contexts for one SDK operation.""" + """Attach tracing context and metrics lifecycle for one SDK operation.""" def __init__(self, telemetry, span_context, metrics_context): self._telemetry = telemetry @@ -71,7 +71,7 @@ def __exit__(self, exc_type, exc_val, exc_tb): class _TelemetryOperation: - """Operation telemetry facade that fans lifecycle events out to tracing and metrics.""" + """Span-like facade that forwards lifecycle events to tracing and metrics.""" def __init__(self, span, metrics): self._span = span From b8ca87b4d8979c54c0241bfe0ab0570be70e7abc Mon Sep 17 00:00:00 2001 From: tewbo Date: Sat, 16 May 2026 02:35:55 +0300 Subject: [PATCH 11/30] format --- ydb/opentelemetry/metrics.py | 6 +++--- ydb/query/session.py | 5 ++++- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/ydb/opentelemetry/metrics.py b/ydb/opentelemetry/metrics.py index c23ee20c8..968825862 100644 --- 
a/ydb/opentelemetry/metrics.py +++ b/ydb/opentelemetry/metrics.py @@ -8,7 +8,7 @@ import time import threading import itertools -from typing import Any, Dict, Optional +from typing import Any, Dict, Literal, Optional CLIENT_OPERATION_DURATION = "db.client.operation.duration" CLIENT_OPERATION_FAILED = "ydb.client.operation.failed" @@ -228,7 +228,7 @@ def end(self) -> None: def __enter__(self) -> "MetricsOperation": return self - def __exit__(self, exc_type, exc_val, exc_tb) -> bool: + def __exit__(self, exc_type, exc_val, exc_tb): if exc_val is not None: self.set_error(exc_val) self.end() @@ -245,7 +245,7 @@ def __init__(self, operation: MetricsOperation, end_on_exit: bool) -> None: def __enter__(self) -> MetricsOperation: return self._operation - def __exit__(self, exc_type, exc_val, exc_tb) -> bool: + def __exit__(self, exc_type, exc_val, exc_tb): if exc_val is not None: self._operation.set_error(exc_val) self._operation.end() diff --git a/ydb/query/session.py b/ydb/query/session.py index 28dc80c06..78d229ab2 100644 --- a/ydb/query/session.py +++ b/ydb/query/session.py @@ -20,7 +20,6 @@ from .. 
import _apis, issues, _utilities from ..opentelemetry.tracing import SpanName, create_ydb_span, set_peer_attributes, span_finish_callback from ..opentelemetry.metrics import record_query_session_count -from ..opentelemetry.tracing import create_ydb_span, set_peer_attributes from ..settings import BaseRequestSettings from ..connection import _RpcState as RpcState, EndpointKey from .._grpc.grpcwrapper import common_utils @@ -97,6 +96,8 @@ class BaseQuerySession(abc.ABC, Generic[DriverT]): _closed: bool = False _invalidated: bool = False _metrics_counted: bool = False + _metrics_pool_name: Optional[str] = None + _metrics_state: str = "used" def __init__(self, driver: DriverT, settings: Optional[base.QueryClientSettings] = None): self._driver = driver @@ -110,6 +111,8 @@ def __init__(self, driver: DriverT, settings: Optional[base.QueryClientSettings] self._last_query_stats = None self._metrics_counted = False + self._metrics_pool_name = None + self._metrics_state = "used" @property def _driver_config(self) -> Optional["DriverConfig"]: From 850679ae593615b10b8ae712568bfb71b77f3902 Mon Sep 17 00:00:00 2001 From: tewbo Date: Sat, 16 May 2026 02:50:47 +0300 Subject: [PATCH 12/30] fix tests --- tests/opentelemetry/conftest.py | 11 ++++++++++- ydb/opentelemetry/metrics.py | 2 +- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/tests/opentelemetry/conftest.py b/tests/opentelemetry/conftest.py index 7518dd213..e28fe305a 100644 --- a/tests/opentelemetry/conftest.py +++ b/tests/opentelemetry/conftest.py @@ -8,6 +8,8 @@ from opentelemetry import trace from opentelemetry.sdk.metrics import MeterProvider +from opentelemetry.sdk.metrics import Counter, Histogram, ObservableUpDownCounter, UpDownCounter +from opentelemetry.sdk.metrics.export import AggregationTemporality from opentelemetry.sdk.metrics.export import InMemoryMetricReader from opentelemetry.sdk.trace import TracerProvider from opentelemetry.sdk.trace.export import SimpleSpanProcessor @@ -44,7 +46,14 @@ def 
metrics_setup(): """Enable SDK metrics with an in-memory reader, then restore noop defaults.""" from ydb.opentelemetry import disable_metrics, enable_metrics - reader = InMemoryMetricReader() + reader = InMemoryMetricReader( + preferred_temporality={ + Counter: AggregationTemporality.CUMULATIVE, + Histogram: AggregationTemporality.CUMULATIVE, + ObservableUpDownCounter: AggregationTemporality.CUMULATIVE, + UpDownCounter: AggregationTemporality.CUMULATIVE, + } + ) provider = MeterProvider(metric_readers=[reader]) disable_metrics() diff --git a/ydb/opentelemetry/metrics.py b/ydb/opentelemetry/metrics.py index 968825862..bf9047dae 100644 --- a/ydb/opentelemetry/metrics.py +++ b/ydb/opentelemetry/metrics.py @@ -8,7 +8,7 @@ import time import threading import itertools -from typing import Any, Dict, Literal, Optional +from typing import Any, Dict, Optional CLIENT_OPERATION_DURATION = "db.client.operation.duration" CLIENT_OPERATION_FAILED = "ydb.client.operation.failed" From 986231b84bab166c67ff78ce25f13fd756907ac2 Mon Sep 17 00:00:00 2001 From: tewbo Date: Sat, 16 May 2026 03:02:06 +0300 Subject: [PATCH 13/30] fix tests --- tests/opentelemetry/test_metrics.py | 44 +++++++++++++++++++---------- 1 file changed, 29 insertions(+), 15 deletions(-) diff --git a/tests/opentelemetry/test_metrics.py b/tests/opentelemetry/test_metrics.py index 0605e98d6..f5cfd6a37 100644 --- a/tests/opentelemetry/test_metrics.py +++ b/tests/opentelemetry/test_metrics.py @@ -17,7 +17,11 @@ def _metrics_by_name(reader): def _single_point(reader, name): - metric = _metrics_by_name(reader)[name] + return _single_point_from_metrics(_metrics_by_name(reader), name) + + +def _single_point_from_metrics(metrics, name): + metric = metrics[name] points = list(metric.data.data_points) assert len(points) == 1 return points[0] @@ -304,16 +308,20 @@ def test_query_session_helpers_record_pool_attributes(metrics_setup): record_query_session_pending_requests(1, None) record_query_session_timeout("main") - assert 
_histogram_sum(metrics_setup, QUERY_SESSION_CREATE_TIME) == 0.5 - assert _single_point(metrics_setup, QUERY_SESSION_CREATE_TIME).attributes == {"ydb.query.session.pool.name": "main"} - assert _sum_value(metrics_setup, QUERY_SESSION_PENDING_REQUESTS) == 1 - assert _single_point(metrics_setup, QUERY_SESSION_PENDING_REQUESTS).attributes == { - "ydb.query.session.pool.name": "unknown" - } - assert _sum_value(metrics_setup, QUERY_SESSION_TIMEOUTS) == 1 - assert _single_point(metrics_setup, QUERY_SESSION_TIMEOUTS).attributes == {"ydb.query.session.pool.name": "main"} - assert _single_point(metrics_setup, QUERY_SESSION_MAX).value == 100 - assert _single_point(metrics_setup, QUERY_SESSION_MAX).attributes == {"ydb.query.session.pool.name": "main"} + metrics = _metrics_by_name(metrics_setup) + create_time = _single_point_from_metrics(metrics, QUERY_SESSION_CREATE_TIME) + pending_requests = _single_point_from_metrics(metrics, QUERY_SESSION_PENDING_REQUESTS) + timeouts = _single_point_from_metrics(metrics, QUERY_SESSION_TIMEOUTS) + session_max = _single_point_from_metrics(metrics, QUERY_SESSION_MAX) + + assert create_time.sum == 0.5 + assert create_time.attributes == {"ydb.query.session.pool.name": "main"} + assert pending_requests.value == 1 + assert pending_requests.attributes == {"ydb.query.session.pool.name": "unknown"} + assert timeouts.value == 1 + assert timeouts.attributes == {"ydb.query.session.pool.name": "main"} + assert session_max.value == 100 + assert session_max.attributes == {"ydb.query.session.pool.name": "main"} def test_sync_query_session_pool_records_max(metrics_setup): @@ -368,11 +376,14 @@ def flaky(): assert retry_operation_sync(flaky, RetrySettings(max_retries=5)) == "ok" - duration = _single_point(metrics_setup, RETRY_DURATION) + metrics = _metrics_by_name(metrics_setup) + duration = _single_point_from_metrics(metrics, RETRY_DURATION) + retry_attempts = _single_point_from_metrics(metrics, RETRY_ATTEMPTS) + assert duration.count == 1 assert duration.sum 
>= 0 assert duration.attributes == {} - assert _histogram_sum(metrics_setup, RETRY_ATTEMPTS) == 3 + assert retry_attempts.sum == 3 async def _async_value(): @@ -386,8 +397,11 @@ async def test_retry_operation_async_records_retry_metrics(metrics_setup): assert await retry_operation_async(_async_value) == "ok" - duration = _single_point(metrics_setup, RETRY_DURATION) + metrics = _metrics_by_name(metrics_setup) + duration = _single_point_from_metrics(metrics, RETRY_DURATION) + retry_attempts = _single_point_from_metrics(metrics, RETRY_ATTEMPTS) + assert duration.count == 1 assert duration.sum >= 0 assert duration.attributes == {} - assert _histogram_sum(metrics_setup, RETRY_ATTEMPTS) == 1 + assert retry_attempts.sum == 1 From 7d5eec23cc86558283cc9f2afd7a98885d3e5e8c Mon Sep 17 00:00:00 2001 From: tewbo Date: Sat, 16 May 2026 16:46:48 +0300 Subject: [PATCH 14/30] make metrics zero-cost --- ydb/opentelemetry/metrics.py | 100 ++++++++++++++++++++++++++++++++--- ydb/opentelemetry/plugin.py | 15 ++++-- ydb/opentelemetry/tracing.py | 4 +- 3 files changed, 104 insertions(+), 15 deletions(-) diff --git a/ydb/opentelemetry/metrics.py b/ydb/opentelemetry/metrics.py index bf9047dae..034bfc131 100644 --- a/ydb/opentelemetry/metrics.py +++ b/ydb/opentelemetry/metrics.py @@ -46,7 +46,7 @@ def __init__(self) -> None: self._instruments: Dict[str, Any] = {} self._query_session_count_values: Dict[Any, int] = {} self._query_session_max_values: Dict[Any, int] = {} - self._query_session_count_lock = threading.Lock() + self._observable_values_lock = threading.Lock() def set_meter( self, @@ -112,7 +112,7 @@ def set_meter( def clear(self) -> None: self._instruments = {} - with self._query_session_count_lock: + with self._observable_values_lock: self._query_session_count_values = {} self._query_session_max_values = {} @@ -131,28 +131,98 @@ def record(self, name: str, value: float, attributes: Optional[Dict[str, Any]] = def add_query_session_count(self, value: int, attributes: 
Optional[Dict[str, Any]] = None) -> None: attrs = tuple(sorted((attributes or {}).items())) - with self._query_session_count_lock: + with self._observable_values_lock: new_value = self._query_session_count_values.get(attrs, 0) + value self._query_session_count_values.pop(attrs, None) self._query_session_count_values[attrs] = new_value def get_query_session_count_values(self) -> Dict[Any, int]: - with self._query_session_count_lock: + with self._observable_values_lock: return dict(self._query_session_count_values) def set_query_session_max(self, value: int, attributes: Optional[Dict[str, Any]] = None) -> None: attrs = tuple(sorted((attributes or {}).items())) - with self._query_session_count_lock: + with self._observable_values_lock: self._query_session_max_values[attrs] = value def get_query_session_max_values(self) -> Dict[Any, int]: - with self._query_session_count_lock: + with self._observable_values_lock: return dict(self._query_session_max_values) -_metrics_registry = MetricsRegistry() +_metrics_registry: Optional[MetricsRegistry] = None + + +class _NoopMetricsOperation: + def set_error(self, exception: BaseException) -> None: + pass + + def set_attribute(self, key: str, value: Any) -> None: + pass + + def attach_context(self, end_on_exit=True) -> "_NoopMetricsOperationContext": + return _NoopMetricsOperationContext(self) + + def end(self) -> None: + pass + + def __enter__(self) -> "_NoopMetricsOperation": + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + return False + + +class _NoopMetricsOperationContext: + def __init__(self, operation: _NoopMetricsOperation) -> None: + self._operation = operation + + def __enter__(self) -> _NoopMetricsOperation: + return self._operation + + def __exit__(self, exc_type, exc_val, exc_tb): + return False + + +_NOOP_METRICS_OPERATION = _NoopMetricsOperation() + + +def is_metrics_enabled() -> bool: + return _metrics_registry is not None + + +def enable_metrics_registry( + meter: Any, + 
observe_query_session_count_callback: Any, + observe_query_session_max_callback: Any, +) -> None: + global _metrics_registry + + if _metrics_registry is None: + _metrics_registry = MetricsRegistry() + _metrics_registry.set_meter(meter, observe_query_session_count_callback, observe_query_session_max_callback) + + +def disable_metrics_registry() -> None: + global _metrics_registry + + if _metrics_registry is not None: + _metrics_registry.clear() + _metrics_registry = None + + +def get_query_session_count_values() -> Dict[Any, int]: + if _metrics_registry is None: + return {} + return _metrics_registry.get_query_session_count_values() + + +def get_query_session_max_values() -> Dict[Any, int]: + if _metrics_registry is None: + return {} + return _metrics_registry.get_query_session_max_values() def _pool_attrs(pool_name: Optional[str]) -> Dict[str, Any]: @@ -254,32 +324,46 @@ def __exit__(self, exc_type, exc_val, exc_tb): return False -def create_metrics_operation(name: str, attributes: Optional[Dict[str, Any]] = None) -> MetricsOperation: +def create_metrics_operation(name: str, attributes: Optional[Dict[str, Any]] = None): + if _metrics_registry is None: + return _NOOP_METRICS_OPERATION return MetricsOperation(name, attributes) def record_query_session_count(delta: int, pool_name: Optional[str] = None, state: str = "used") -> None: + if _metrics_registry is None: + return attrs = _pool_attrs(pool_name) attrs["ydb.query.session.state"] = state _metrics_registry.add_query_session_count(delta, attrs) def record_query_session_create_time(duration: float, pool_name: Optional[str]) -> None: + if _metrics_registry is None: + return _metrics_registry.record(QUERY_SESSION_CREATE_TIME, duration, _pool_attrs(pool_name)) def record_query_session_pending_requests(delta: int, pool_name: Optional[str]) -> None: + if _metrics_registry is None: + return _metrics_registry.add(QUERY_SESSION_PENDING_REQUESTS, delta, _pool_attrs(pool_name)) def record_query_session_timeout(pool_name: 
Optional[str]) -> None: + if _metrics_registry is None: + return _metrics_registry.add(QUERY_SESSION_TIMEOUTS, 1, _pool_attrs(pool_name)) def record_query_session_max(value: int, pool_name: Optional[str]) -> None: + if _metrics_registry is None: + return _metrics_registry.set_query_session_max(value, _pool_attrs(pool_name)) def record_retry_metrics(duration: float, attempts: int) -> None: + if _metrics_registry is None: + return _metrics_registry.record(RETRY_DURATION, duration) _metrics_registry.record(RETRY_ATTEMPTS, attempts) diff --git a/ydb/opentelemetry/plugin.py b/ydb/opentelemetry/plugin.py index 7aa7ca510..b0e2b722f 100644 --- a/ydb/opentelemetry/plugin.py +++ b/ydb/opentelemetry/plugin.py @@ -9,7 +9,12 @@ from ydb import issues from ydb.issues import StatusCode as YdbStatusCode -from ydb.opentelemetry.metrics import _metrics_registry +from ydb.opentelemetry.metrics import ( + disable_metrics_registry, + enable_metrics_registry, + get_query_session_count_values, + get_query_session_max_values, +) from ydb.opentelemetry.tracing import _registry as _tracing_registry # YDB client transport StatusCode values (401xxx band) -> OTel error.type transport_error. 
@@ -150,12 +155,12 @@ def observe(_): def _create_query_session_count_callback(): """Create callback for observable query session count metric.""" - return _create_observable_callback(_metrics_registry.get_query_session_count_values) + return _create_observable_callback(get_query_session_count_values) def _create_query_session_max_callback(): """Create callback for observable query session max metric.""" - return _create_observable_callback(_metrics_registry.get_query_session_max_values) + return _create_observable_callback(get_query_session_max_values) def _enable_metrics(meter_provider): @@ -172,12 +177,12 @@ def _enable_metrics(meter_provider): else: raise TypeError("meter_provider must be an OpenTelemetry MeterProvider") - _metrics_registry.set_meter(_meter, _create_query_session_count_callback(), _create_query_session_max_callback()) + enable_metrics_registry(_meter, _create_query_session_count_callback(), _create_query_session_max_callback()) def _disable_metrics(): global _meter - _metrics_registry.clear() + disable_metrics_registry() if _meter is not None: _meter = None diff --git a/ydb/opentelemetry/tracing.py b/ydb/opentelemetry/tracing.py index d601fa204..262a99c53 100644 --- a/ydb/opentelemetry/tracing.py +++ b/ydb/opentelemetry/tracing.py @@ -3,7 +3,7 @@ import enum from typing import Optional, Tuple -from ydb.opentelemetry.metrics import create_metrics_operation +from ydb.opentelemetry.metrics import create_metrics_operation, is_metrics_enabled class SpanName(str, enum.Enum): @@ -191,7 +191,7 @@ def create_ydb_span(name, driver_config, node_id=None, kind=None, peer=None): Tracing receives full operation context, including peer/node details. Metrics receive only the stable labels defined for client operation metrics. 
""" - metrics_attrs = _build_ydb_attrs(driver_config) + metrics_attrs = _build_ydb_attrs(driver_config) if is_metrics_enabled() else None tracing_attrs = _build_ydb_tracing_attrs(driver_config, node_id, peer) metrics = create_metrics_operation(name, metrics_attrs) return _TelemetryOperation(_registry.create_span(name, attributes=tracing_attrs, kind=kind), metrics) From 708becdf14a10e0fc1402f1b7b085c13c0f48349 Mon Sep 17 00:00:00 2001 From: tewbo Date: Sat, 16 May 2026 16:55:03 +0300 Subject: [PATCH 15/30] format --- ydb/opentelemetry/metrics.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/ydb/opentelemetry/metrics.py b/ydb/opentelemetry/metrics.py index 034bfc131..58e18129b 100644 --- a/ydb/opentelemetry/metrics.py +++ b/ydb/opentelemetry/metrics.py @@ -287,13 +287,17 @@ def end(self) -> None: return self._ended = True + registry = _metrics_registry + if registry is None: + return + duration = time.monotonic() - self._start_time - _metrics_registry.record(CLIENT_OPERATION_DURATION, duration, self._attributes) + registry.record(CLIENT_OPERATION_DURATION, duration, self._attributes) if self._exception is not None: attrs = dict(self._attributes) attrs["db.response.status_code"] = _response_status_code(self._exception) - _metrics_registry.add(CLIENT_OPERATION_FAILED, 1, attrs) + registry.add(CLIENT_OPERATION_FAILED, 1, attrs) def __enter__(self) -> "MetricsOperation": return self From 934489830d6a5156749fee35862bf6126d10c843 Mon Sep 17 00:00:00 2001 From: tewbo Date: Sun, 17 May 2026 03:34:52 +0300 Subject: [PATCH 16/30] fix test --- tests/opentelemetry/test_metrics.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/opentelemetry/test_metrics.py b/tests/opentelemetry/test_metrics.py index f5cfd6a37..b0400d249 100644 --- a/tests/opentelemetry/test_metrics.py +++ b/tests/opentelemetry/test_metrics.py @@ -103,8 +103,6 @@ def test_metrics_registry_is_noop_without_meter(): record_retry_metrics, ) - _metrics_registry.clear() - 
record_query_session_create_time(1.0, "pool") record_query_session_pending_requests(1, "pool") record_query_session_timeout("pool") From a8dd9bc934334c28e726c4ebfafd4137d45ee085 Mon Sep 17 00:00:00 2001 From: tewbo Date: Sun, 17 May 2026 03:36:30 +0300 Subject: [PATCH 17/30] format --- tests/opentelemetry/test_metrics.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/opentelemetry/test_metrics.py b/tests/opentelemetry/test_metrics.py index b0400d249..2a1bc2c73 100644 --- a/tests/opentelemetry/test_metrics.py +++ b/tests/opentelemetry/test_metrics.py @@ -95,7 +95,6 @@ def test_metrics_registry_records_all_instruments(metrics_setup, monkeypatch): def test_metrics_registry_is_noop_without_meter(): from ydb.opentelemetry.metrics import ( - _metrics_registry, create_metrics_operation, record_query_session_create_time, record_query_session_pending_requests, From d1e5b1c74cf3a7fb9800bab98e60b56cbe49eef1 Mon Sep 17 00:00:00 2001 From: tewbo Date: Sun, 17 May 2026 04:19:32 +0300 Subject: [PATCH 18/30] remove lock --- ydb/opentelemetry/metrics.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/ydb/opentelemetry/metrics.py b/ydb/opentelemetry/metrics.py index 58e18129b..7c6679e96 100644 --- a/ydb/opentelemetry/metrics.py +++ b/ydb/opentelemetry/metrics.py @@ -231,8 +231,7 @@ def _pool_attrs(pool_name: Optional[str]) -> Dict[str, Any]: def next_query_session_pool_name() -> str: """Return a process-unique default query session pool name for metric labels.""" - with _pool_name_lock: - return "query-session-pool-%d" % next(_pool_name_counter) + return "query-session-pool-%d" % next(_pool_name_counter) def _operation_attrs(operation_name: str, attributes: Dict[str, Any]) -> Dict[str, Any]: From d6ab50dbac9631d7c09342b76b6accc8a27d1def Mon Sep 17 00:00:00 2001 From: tewbo Date: Tue, 19 May 2026 16:59:22 +0300 Subject: [PATCH 19/30] fix async test --- tests/opentelemetry/test_tracing_async.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) 
diff --git a/tests/opentelemetry/test_tracing_async.py b/tests/opentelemetry/test_tracing_async.py index af2b369b6..1276fa69d 100644 --- a/tests/opentelemetry/test_tracing_async.py +++ b/tests/opentelemetry/test_tracing_async.py @@ -509,10 +509,10 @@ def _make_session(): async def do_execute(qs): fake_stream = _slow_async_iter() - with patch.object(QuerySession, "_execute_call", new_callable=AsyncMock, return_value=fake_stream): - result = await qs.execute("SELECT 1") - async for _ in result: - pass + qs._execute_call = AsyncMock(return_value=fake_stream) + result = await qs.execute("SELECT 1") + async for _ in result: + pass qs1 = _make_session() qs2 = _make_session() From 7d154693895e02e1634a9165fa2cbaa36f0118dc Mon Sep 17 00:00:00 2001 From: tewbo Date: Fri, 22 May 2026 03:07:18 +0300 Subject: [PATCH 20/30] add load tank and grafana dashboard --- examples/opentelemetry/Dockerfile | 10 +- examples/opentelemetry/README.md | 66 +++- examples/opentelemetry/compose-e2e.yaml | 33 +- .../dashboards/ydb-python-sdk-metrics.json | 186 +++++++++++ .../provisioning/datasources/datasources.yaml | 6 +- examples/opentelemetry/load_tank.py | 290 ++++++++++++++++++ examples/opentelemetry/metrics_views.py | 42 +++ examples/opentelemetry/otel_example.py | 5 +- 8 files changed, 620 insertions(+), 18 deletions(-) create mode 100644 examples/opentelemetry/grafana/dashboards/ydb-python-sdk-metrics.json create mode 100644 examples/opentelemetry/load_tank.py create mode 100644 examples/opentelemetry/metrics_views.py diff --git a/examples/opentelemetry/Dockerfile b/examples/opentelemetry/Dockerfile index 17f646be9..c981ee04d 100644 --- a/examples/opentelemetry/Dockerfile +++ b/examples/opentelemetry/Dockerfile @@ -1,6 +1,6 @@ -# Isolated image for the OpenTelemetry demo. Build context is the repository root. +# Isolated image for the OpenTelemetry demo scripts. Build context is the repository root. 
# -# docker compose -f examples/opentelemetry/compose-e2e.yaml build otel-example +# docker compose -f examples/opentelemetry/compose-e2e.yaml build # # A separate ``.dockerignore`` at the repo root keeps the context small. @@ -17,7 +17,7 @@ COPY ydb ./ydb COPY examples/opentelemetry/requirements.txt ./examples/opentelemetry/requirements.txt RUN pip install --no-cache-dir -e '.[opentelemetry]' -r examples/opentelemetry/requirements.txt -# Demo script. +# Demo scripts. COPY examples/opentelemetry/otel_example.py ./examples/opentelemetry/otel_example.py - -CMD ["python", "examples/opentelemetry/otel_example.py"] +COPY examples/opentelemetry/load_tank.py ./examples/opentelemetry/load_tank.py +COPY examples/opentelemetry/metrics_views.py ./examples/opentelemetry/metrics_views.py diff --git a/examples/opentelemetry/README.md b/examples/opentelemetry/README.md index d524df518..d1667ea8d 100644 --- a/examples/opentelemetry/README.md +++ b/examples/opentelemetry/README.md @@ -4,6 +4,13 @@ Async demo in [`otel_example.py`](otel_example.py): OTLP export, `enable_tracing `enable_metrics()`, `app_startup` and `example_tli` application spans, SDK client metrics, bank table, Serializable transactions (TLI-style load). +[`load_tank.py`](load_tank.py) runs a small step-like load profile for the +metrics dashboard: + +```text +Peak RPS -> Medium RPS -> Min RPS -> Medium RPS -> repeat +``` + Most steps assume the **repository root** as the current directory; the install step also shows the variant from this folder. ## 1. Start YDB (or the full stack) with Docker **first** @@ -18,7 +25,10 @@ docker compose up -d # wait until the ydb container is healthy / port 2136 is open, then continue ``` -**Full stack** (YDB + OTLP collector + Tempo + Prometheus + Grafana; the `otel-example` service is built from a `Dockerfile` and runs the script once inside Compose). The compose file is `compose-e2e.yaml` next to this README. 
+**Full stack** (YDB + OTLP collector + Tempo + Prometheus + Grafana; the +`otel-example` service runs the tracing/metrics demo once, and `load-generator` +runs the metrics load tank). The compose file is `compose-e2e.yaml` next to this +README. ```sh cd /path/to/ydb-python-sdk @@ -37,12 +47,25 @@ The first run builds the `otel-example` image from the local SDK source (`Docker Grafana: http://localhost:3000 Prometheus: http://localhost:9090 -Use Grafana to explore traces through Tempo and metrics through Prometheus. In -Prometheus, SDK metric names are exposed in Prometheus format; search by prefixes such -as `db_client_operation_duration` and `ydb_query_session_count`. +Grafana is provisioned with the **YDB Python SDK Metrics** dashboard. It uses +Prometheus queries for SDK metrics such as `db_client_operation_duration`, +`ydb_client_operation_failed`, `ydb_query_session_count`, +`ydb_query_session_pending_requests`, `ydb_query_session_create_time`, and +`ydb_client_retry_duration`. Use Grafana Explore for ad-hoc traces through Tempo +and metrics through Prometheus. + +The examples configure custom OpenTelemetry histogram views in +[`metrics_views.py`](metrics_views.py). The SDK records duration values in +seconds, but the default histogram buckets are too coarse for fast local YDB +operations. The custom views keep the `s` unit and use sub-millisecond / +millisecond-scale buckets so Grafana percentiles show meaningful latency +distributions. **Logs for `otel-example`:** the container name is prefixed (e.g. `opentelemetry-otel-example-1`); use `docker compose -f examples/opentelemetry/compose-e2e.yaml ps` or `docker ps -a` to find it. The service is one-shot (`restart: "no"`) — it may already have exited. +**Logs for `load-generator`:** the service is also one-shot. It runs for +`LOAD_TANK_TOTAL_TIME` seconds and then exits after flushing metrics. + ## 2. 
Install dependencies (on the host, for a local `python` run) **From the repository root** (editable SDK + pins from this example): @@ -71,10 +94,35 @@ python examples/opentelemetry/otel_example.py Defaults: YDB `grpc://localhost:2136`, OTLP `http://localhost:4317` (for a local collector, if you use one). The same OTLP endpoint receives both traces and metrics. +Run the load tank against an already running local stack: + +```sh +python examples/opentelemetry/load_tank.py +``` + ## Environment (Docker / overrides) -| Variable | Meaning | -|----------|---------| -| `YDB_ENDPOINT` | e.g. `grpc://ydb:2136` inside the Compose network | -| `YDB_DATABASE` | default `/local` | -| `OTEL_EXPORTER_OTLP_ENDPOINT` | e.g. `http://otel-collector:4317` | +| Variable | Meaning | +|----------|----------------------------------------------------------| +| `YDB_ENDPOINT` | e.g. `grpc://ydb:2136` inside the Compose network | +| `YDB_DATABASE` | default `/local` | +| `OTEL_EXPORTER_OTLP_ENDPOINT` | e.g. `http://otel-collector:4317` | +| `OTEL_SERVICE_NAME` | service name attached to exported telemetry | +| `LOAD_TANK_TOTAL_TIME` | total load duration in seconds, default `6000` | +| `LOAD_TANK_WORKERS` | number of concurrent workers, default `40` | +| `LOAD_TANK_POOL_SIZE` | query session pool size, default `20` | +| `LOAD_TANK_PEAK_RPS` | peak phase target RPS, default `120` | +| `LOAD_TANK_MEDIUM_RPS` | medium phase target RPS, default `30` | +| `LOAD_TANK_MIN_RPS` | low phase target RPS, default `3` | +| `LOAD_TANK_ERROR_RPS` | failed query target RPS, default `1`; set `0` to disable | +| `LOAD_TANK_PRESSURE_POOL_SIZE` | pool size for session pressure metrics, default `1` | +| `LOAD_TANK_PRESSURE_WORKERS` | concurrent contenders for the pressure pool, default `8` | +| `LOAD_TANK_PRESSURE_HOLD_TIME` | seconds to hold the pressure-pool session, default `1.5` | +| `LOAD_TANK_PRESSURE_ACQUIRE_TIMEOUT` | short acquire timeout for timeout metrics, default `1.0` | +| 
`LOAD_TANK_PRESSURE_INTERVAL` | pause between pressure rounds, default `0.2` | +| `LOAD_TANK_SESSION_CHURN_INTERVAL` | interval for creating fresh sessions, default `2.0` | +| `LOAD_TANK_PEAK_DURATION` | peak phase duration in seconds, default `60` | +| `LOAD_TANK_MEDIUM_DURATION` | medium phase duration in seconds, default `90` | +| `LOAD_TANK_MIN_DURATION` | low phase duration in seconds, default `60` | +| `LOAD_TANK_QUERY` | query executed by workers, default `SELECT 1 AS value` | +| `LOAD_TANK_ERROR_QUERY` | query used to produce failed-operation metrics | diff --git a/examples/opentelemetry/compose-e2e.yaml b/examples/opentelemetry/compose-e2e.yaml index f8402d50f..e0bf9a5e4 100644 --- a/examples/opentelemetry/compose-e2e.yaml +++ b/examples/opentelemetry/compose-e2e.yaml @@ -1,5 +1,5 @@ # Full OpenTelemetry demo: YDB (server-side tracing config), collector, Tempo, Prometheus, Grafana, -# and a one-shot container that runs otel_example.py once. +# a one-shot container that runs otel_example.py once, and a load generator for live metrics. # # Run from this directory (paths below are relative to this file): # cd examples/opentelemetry && docker compose -f compose-e2e.yaml up @@ -78,6 +78,7 @@ services: build: context: ../.. dockerfile: examples/opentelemetry/Dockerfile + command: ["python", "examples/opentelemetry/otel_example.py"] environment: YDB_ENDPOINT: grpc://ydb:2136 YDB_DATABASE: /local @@ -89,3 +90,33 @@ services: otel-collector: condition: service_started restart: "no" + + load-generator: + build: + context: ../.. 
+ dockerfile: examples/opentelemetry/Dockerfile + command: ["python", "examples/opentelemetry/load_tank.py"] + environment: + YDB_ENDPOINT: grpc://ydb:2136 + YDB_DATABASE: /local + OTEL_EXPORTER_OTLP_ENDPOINT: http://otel-collector:4317 + OTEL_SERVICE_NAME: ydb-python-load-tank + LOAD_TANK_TOTAL_TIME: "6000" + LOAD_TANK_WORKERS: "40" + LOAD_TANK_POOL_SIZE: "20" + LOAD_TANK_PEAK_RPS: "120" + LOAD_TANK_MEDIUM_RPS: "30" + LOAD_TANK_MIN_RPS: "3" + LOAD_TANK_ERROR_RPS: "1" + LOAD_TANK_PRESSURE_POOL_SIZE: "1" + LOAD_TANK_PRESSURE_WORKERS: "8" + LOAD_TANK_PRESSURE_HOLD_TIME: "1.5" + LOAD_TANK_PRESSURE_ACQUIRE_TIMEOUT: "1.0" + LOAD_TANK_PRESSURE_INTERVAL: "0.2" + LOAD_TANK_SESSION_CHURN_INTERVAL: "2.0" + depends_on: + ydb: + condition: service_healthy + otel-collector: + condition: service_started + restart: "no" diff --git a/examples/opentelemetry/grafana/dashboards/ydb-python-sdk-metrics.json b/examples/opentelemetry/grafana/dashboards/ydb-python-sdk-metrics.json new file mode 100644 index 000000000..798b51717 --- /dev/null +++ b/examples/opentelemetry/grafana/dashboards/ydb-python-sdk-metrics.json @@ -0,0 +1,186 @@ +{ + "title": "YDB Python SDK Metrics", + "uid": "ydb-python-sdk-metrics", + "schemaVersion": 38, + "timezone": "browser", + "refresh": "5s", + "tags": ["ydb", "python", "opentelemetry"], + "panels": [ + { + "title": "Request Rate (RPS)", + "type": "timeseries", + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 0 }, + "fieldConfig": { "defaults": { "unit": "reqps" } }, + "options": { "legend": { "displayMode": "table", "placement": "right" } }, + "targets": [ + { + "expr": "sum by (ydb_operation_name) (rate(db_client_operation_duration_seconds_count[$__rate_interval]))", + "legendFormat": "{{ydb_operation_name}}", + "datasource": { "type": "prometheus", "uid": "prometheus" } + } + ] + }, + { + "title": "Error Rate (RPS)", + "type": "timeseries", + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 0 }, + "fieldConfig": { "defaults": { "unit": "reqps" } }, + "options": { 
"legend": { "displayMode": "table", "placement": "right" } }, + "targets": [ + { + "expr": "sum by (db_response_status_code) (rate(ydb_client_operation_failed_total[$__rate_interval]))", + "legendFormat": "{{db_response_status_code}}", + "datasource": { "type": "prometheus", "uid": "prometheus" } + } + ] + }, + { + "title": "Query Session Pool (Used / Idle)", + "type": "timeseries", + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 8 }, + "fieldConfig": { "defaults": { "unit": "short" } }, + "options": { "legend": { "displayMode": "table", "placement": "right" } }, + "targets": [ + { + "expr": "sum by (ydb_query_session_pool_name) (ydb_query_session_count{ydb_query_session_state=\"used\"})", + "legendFormat": "used - {{ydb_query_session_pool_name}}", + "datasource": { "type": "prometheus", "uid": "prometheus" } + }, + { + "expr": "sum by (ydb_query_session_pool_name) (ydb_query_session_count{ydb_query_session_state=\"idle\"})", + "legendFormat": "idle - {{ydb_query_session_pool_name}}", + "datasource": { "type": "prometheus", "uid": "prometheus" } + }, + { + "expr": "sum by (ydb_query_session_pool_name) (ydb_query_session_max)", + "legendFormat": "max - {{ydb_query_session_pool_name}}", + "datasource": { "type": "prometheus", "uid": "prometheus" } + } + ] + }, + { + "title": "Pending Session Requests", + "type": "timeseries", + "gridPos": { "h": 8, "w": 6, "x": 12, "y": 8 }, + "fieldConfig": { "defaults": { "unit": "short" } }, + "options": { "legend": { "displayMode": "table", "placement": "right" } }, + "targets": [ + { + "expr": "sum by (ydb_query_session_pool_name) (ydb_query_session_pending_requests)", + "legendFormat": "{{ydb_query_session_pool_name}}", + "datasource": { "type": "prometheus", "uid": "prometheus" } + } + ] + }, + { + "title": "Session Acquire Timeouts", + "type": "timeseries", + "gridPos": { "h": 8, "w": 6, "x": 18, "y": 8 }, + "fieldConfig": { "defaults": { "unit": "short" } }, + "options": { "legend": { "displayMode": "table", "placement": 
"right" } }, + "targets": [ + { + "expr": "sum by (ydb_query_session_pool_name) (increase(ydb_query_session_timeouts_total[$__rate_interval]))", + "legendFormat": "{{ydb_query_session_pool_name}}", + "datasource": { "type": "prometheus", "uid": "prometheus" } + } + ] + }, + { + "title": "Operation Latency Percentiles", + "type": "timeseries", + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 16 }, + "fieldConfig": { "defaults": { "unit": "s" } }, + "options": { "legend": { "displayMode": "table", "placement": "right" } }, + "targets": [ + { + "expr": "histogram_quantile(0.50, sum by (le, ydb_operation_name) (rate(db_client_operation_duration_seconds_bucket[$__rate_interval])))", + "legendFormat": "p50 - {{ydb_operation_name}}", + "datasource": { "type": "prometheus", "uid": "prometheus" } + }, + { + "expr": "histogram_quantile(0.95, sum by (le, ydb_operation_name) (rate(db_client_operation_duration_seconds_bucket[$__rate_interval])))", + "legendFormat": "p95 - {{ydb_operation_name}}", + "datasource": { "type": "prometheus", "uid": "prometheus" } + }, + { + "expr": "histogram_quantile(0.99, sum by (le, ydb_operation_name) (rate(db_client_operation_duration_seconds_bucket[$__rate_interval])))", + "legendFormat": "p99 - {{ydb_operation_name}}", + "datasource": { "type": "prometheus", "uid": "prometheus" } + } + ] + }, + { + "title": "Session Create Time Percentiles", + "type": "timeseries", + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 16 }, + "fieldConfig": { "defaults": { "unit": "s" } }, + "options": { "legend": { "displayMode": "table", "placement": "right" } }, + "targets": [ + { + "expr": "histogram_quantile(0.50, sum by (le, ydb_query_session_pool_name) (rate(ydb_query_session_create_time_seconds_bucket[$__rate_interval])))", + "legendFormat": "p50 - {{ydb_query_session_pool_name}}", + "datasource": { "type": "prometheus", "uid": "prometheus" } + }, + { + "expr": "histogram_quantile(0.95, sum by (le, ydb_query_session_pool_name) 
(rate(ydb_query_session_create_time_seconds_bucket[$__rate_interval])))", + "legendFormat": "p95 - {{ydb_query_session_pool_name}}", + "datasource": { "type": "prometheus", "uid": "prometheus" } + }, + { + "expr": "histogram_quantile(0.99, sum by (le, ydb_query_session_pool_name) (rate(ydb_query_session_create_time_seconds_bucket[$__rate_interval])))", + "legendFormat": "p99 - {{ydb_query_session_pool_name}}", + "datasource": { "type": "prometheus", "uid": "prometheus" } + } + ] + }, + { + "title": "Retry Duration Percentiles", + "type": "timeseries", + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 24 }, + "fieldConfig": { "defaults": { "unit": "s" } }, + "options": { "legend": { "displayMode": "table", "placement": "right" } }, + "targets": [ + { + "expr": "histogram_quantile(0.50, sum by (le) (rate(ydb_client_retry_duration_seconds_bucket[$__rate_interval])))", + "legendFormat": "p50", + "datasource": { "type": "prometheus", "uid": "prometheus" } + }, + { + "expr": "histogram_quantile(0.95, sum by (le) (rate(ydb_client_retry_duration_seconds_bucket[$__rate_interval])))", + "legendFormat": "p95", + "datasource": { "type": "prometheus", "uid": "prometheus" } + }, + { + "expr": "histogram_quantile(0.99, sum by (le) (rate(ydb_client_retry_duration_seconds_bucket[$__rate_interval])))", + "legendFormat": "p99", + "datasource": { "type": "prometheus", "uid": "prometheus" } + } + ] + }, + { + "title": "Retry Attempts", + "type": "timeseries", + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 24 }, + "fieldConfig": { "defaults": { "unit": "short" } }, + "options": { "legend": { "displayMode": "table", "placement": "right" } }, + "targets": [ + { + "expr": "sum(rate(ydb_client_retry_attempts_sum[$__rate_interval])) / sum(rate(ydb_client_retry_attempts_count[$__rate_interval]))", + "legendFormat": "avg attempts", + "datasource": { "type": "prometheus", "uid": "prometheus" } + }, + { + "expr": "sum(rate(ydb_client_retry_attempts_count[$__rate_interval])) - 
sum(rate(ydb_client_retry_attempts_bucket{le=\"1\"}[$__rate_interval]))", + "legendFormat": "retried operations rps", + "datasource": { "type": "prometheus", "uid": "prometheus" } + }, + { + "expr": "(sum(rate(ydb_client_retry_attempts_count[$__rate_interval])) - sum(rate(ydb_client_retry_attempts_bucket{le=\"1\"}[$__rate_interval]))) / sum(rate(ydb_client_retry_attempts_count[$__rate_interval]))", + "legendFormat": "retried operations ratio", + "datasource": { "type": "prometheus", "uid": "prometheus" } + } + ] + } + ] +} diff --git a/examples/opentelemetry/grafana/provisioning/datasources/datasources.yaml b/examples/opentelemetry/grafana/provisioning/datasources/datasources.yaml index 05ba5bd95..5898f3ce5 100644 --- a/examples/opentelemetry/grafana/provisioning/datasources/datasources.yaml +++ b/examples/opentelemetry/grafana/provisioning/datasources/datasources.yaml @@ -3,6 +3,7 @@ apiVersion: 1 datasources: - name: Prometheus type: prometheus + uid: prometheus access: proxy url: http://prometheus:9090 isDefault: true @@ -10,13 +11,14 @@ datasources: - name: Tempo type: tempo + uid: tempo access: proxy url: http://tempo:3200 editable: false jsonData: tracesToMetrics: - datasourceUid: Prometheus + datasourceUid: prometheus serviceMap: - datasourceUid: Prometheus + datasourceUid: prometheus diff --git a/examples/opentelemetry/load_tank.py b/examples/opentelemetry/load_tank.py new file mode 100644 index 000000000..2cdb67b65 --- /dev/null +++ b/examples/opentelemetry/load_tank.py @@ -0,0 +1,290 @@ +"""Small OpenTelemetry load generator for the YDB Python SDK example.""" + +from __future__ import annotations + +import asyncio +import os +import random +import time +from dataclasses import dataclass +from typing import AsyncIterator, Tuple, cast + +from opentelemetry.exporter.otlp.proto.grpc.metric_exporter import OTLPMetricExporter +from opentelemetry.sdk.metrics import MeterProvider +from opentelemetry.sdk.metrics._internal.export import 
PeriodicExportingMetricReader +from opentelemetry.sdk.resources import Resource + +import ydb +from ydb.opentelemetry import enable_metrics + +from metrics_views import ydb_metrics_views + + +@dataclass(frozen=True) +class LoadConfig: + endpoint: str + database: str + otlp_endpoint: str + service_name: str + pool_size: int + worker_count: int + peak_rps: int + medium_rps: int + min_rps: int + peak_duration: int + medium_duration: int + min_duration: int + total_time: int + query: str + error_rps: int + error_query: str + pressure_pool_size: int + pressure_workers: int + pressure_hold_time: float + pressure_acquire_timeout: float + pressure_interval: float + session_churn_interval: float + + +def _env(name: str, default: str) -> str: + value = os.environ.get(name) + return value if value is not None and value != "" else default + + +def _env_int(name: str, default: int) -> int: + return int(_env(name, str(default))) + + +def _env_float(name: str, default: float) -> float: + return float(_env(name, str(default))) + + +def _load_config() -> LoadConfig: + return LoadConfig( + endpoint=_env("YDB_ENDPOINT", "grpc://localhost:2136"), + database=_env("YDB_DATABASE", "/local"), + otlp_endpoint=_env("OTEL_EXPORTER_OTLP_ENDPOINT", "http://localhost:4317"), + service_name=_env("OTEL_SERVICE_NAME", "ydb-python-load-tank"), + pool_size=_env_int("LOAD_TANK_POOL_SIZE", 20), + worker_count=_env_int("LOAD_TANK_WORKERS", 40), + peak_rps=_env_int("LOAD_TANK_PEAK_RPS", 120), + medium_rps=_env_int("LOAD_TANK_MEDIUM_RPS", 30), + min_rps=_env_int("LOAD_TANK_MIN_RPS", 3), + peak_duration=_env_int("LOAD_TANK_PEAK_DURATION", 60), + medium_duration=_env_int("LOAD_TANK_MEDIUM_DURATION", 90), + min_duration=_env_int("LOAD_TANK_MIN_DURATION", 60), + total_time=_env_int("LOAD_TANK_TOTAL_TIME", 300), + query=_env("LOAD_TANK_QUERY", "SELECT 1 AS value"), + error_rps=_env_int("LOAD_TANK_ERROR_RPS", 1), + error_query=_env("LOAD_TANK_ERROR_QUERY", "SELECT * FROM 
table_that_does_not_exist_for_metrics"), + pressure_pool_size=_env_int("LOAD_TANK_PRESSURE_POOL_SIZE", 1), + pressure_workers=_env_int("LOAD_TANK_PRESSURE_WORKERS", 8), + pressure_hold_time=_env_float("LOAD_TANK_PRESSURE_HOLD_TIME", 1.5), + pressure_acquire_timeout=_env_float("LOAD_TANK_PRESSURE_ACQUIRE_TIMEOUT", 1.0), + pressure_interval=_env_float("LOAD_TANK_PRESSURE_INTERVAL", 0.2), + session_churn_interval=_env_float("LOAD_TANK_SESSION_CHURN_INTERVAL", 2.0), + ) + + +async def _load_steps(config: LoadConfig) -> AsyncIterator[Tuple[int, str, int]]: + pattern = ( + (config.peak_rps, "Peak", config.peak_duration), + (config.medium_rps, "Medium down", config.medium_duration), + (config.min_rps, "Min", config.min_duration), + (config.medium_rps, "Medium up", config.medium_duration), + ) + deadline = time.monotonic() + config.total_time + + while time.monotonic() < deadline: + for rps, label, duration in pattern: + remaining = int(deadline - time.monotonic()) + if remaining <= 0: + return + yield rps, label, min(duration, remaining) + + +async def _worker( + pool: ydb.aio.QuerySessionPool, + queue: asyncio.Queue[object], + query: str, + stop: asyncio.Event, +) -> None: + while not stop.is_set(): + try: + await asyncio.wait_for(queue.get(), timeout=0.5) + except asyncio.TimeoutError: + continue + + try: + await pool.execute_with_retries(query) + except Exception as exc: + print("Load operation failed: %s" % exc) + finally: + queue.task_done() + + +async def _feed_phase(queue: asyncio.Queue[object], rps: int, duration: int) -> None: + interval = 1.0 / max(rps, 1) + deadline = time.monotonic() + duration + next_tick = time.monotonic() + + while time.monotonic() < deadline: + await queue.put(object()) + next_tick += interval + delay = next_tick - time.monotonic() + if delay > 0: + await asyncio.sleep(delay) + else: + await asyncio.sleep(0) + + +async def _error_worker(pool: ydb.aio.QuerySessionPool, config: LoadConfig, stop: asyncio.Event) -> None: + if config.error_rps 
<= 0: + return + + interval = 1.0 / config.error_rps + next_tick = time.monotonic() + + while not stop.is_set(): + try: + await pool.execute_with_retries(config.error_query) + except Exception: + pass + + next_tick += interval + delay = next_tick - time.monotonic() + if delay > 0: + try: + await asyncio.wait_for(stop.wait(), timeout=delay) + except asyncio.TimeoutError: + pass + else: + await asyncio.sleep(0) + + +async def _pressure_round(pool: ydb.aio.QuerySessionPool, config: LoadConfig) -> None: + async def holder() -> None: + async with pool.checkout(timeout=5): + await asyncio.sleep(config.pressure_hold_time) + + async def contender() -> None: + try: + async with pool.checkout(timeout=config.pressure_acquire_timeout): + pass + except Exception: + pass + + holder_task = asyncio.create_task(holder()) + await asyncio.sleep(0) + contenders = [asyncio.create_task(contender()) for _ in range(config.pressure_workers)] + await asyncio.gather(holder_task, *contenders, return_exceptions=True) + + +async def _pool_pressure_worker(driver: ydb.aio.Driver, config: LoadConfig, stop: asyncio.Event) -> None: + if config.pressure_workers <= 0 or config.pressure_pool_size <= 0: + return + + async with ydb.aio.QuerySessionPool( + driver, + size=config.pressure_pool_size, + name="pool-pressure", + ) as pool: + while not stop.is_set(): + await _pressure_round(pool, config) + try: + await asyncio.wait_for(stop.wait(), timeout=config.pressure_interval) + except asyncio.TimeoutError: + pass + + +async def _session_churn_worker(driver: ydb.aio.Driver, config: LoadConfig, stop: asyncio.Event) -> None: + if config.session_churn_interval <= 0: + return + + while not stop.is_set(): + async with ydb.aio.QuerySessionPool(driver, size=1, name="session-churn") as pool: + await pool.execute_with_retries("SELECT 1 AS value") + + try: + await asyncio.wait_for(stop.wait(), timeout=config.session_churn_interval) + except asyncio.TimeoutError: + pass + + +async def main() -> None: + config = 
_load_config() + + resource = Resource(attributes={"service.name": config.service_name}) + metric_reader = PeriodicExportingMetricReader( + OTLPMetricExporter(endpoint=config.otlp_endpoint), + export_interval_millis=2000, + ) + meter_provider = MeterProvider(resource=resource, metric_readers=[metric_reader], views=ydb_metrics_views()) + enable_metrics(meter_provider) + + print( + "=== YDB Python SDK load tank ===\n" + " total=%ss workers=%s pool_size=%s query=%r error_rps=%s\n" + " pressure_pool_size=%s pressure_workers=%s pressure_timeout=%ss session_churn_interval=%ss\n" + " pattern: Peak(%s rps, %ss) -> Medium(%s rps, %ss) -> " + "Min(%s rps, %ss) -> Medium -> repeat" + % ( + config.total_time, + config.worker_count, + config.pool_size, + config.query, + config.error_rps, + config.pressure_pool_size, + config.pressure_workers, + config.pressure_acquire_timeout, + config.session_churn_interval, + config.peak_rps, + config.peak_duration, + config.medium_rps, + config.medium_duration, + config.min_rps, + config.min_duration, + ) + ) + + async with ydb.aio.Driver( + endpoint=config.endpoint, + database=config.database, + disable_discovery=True, + ) as raw_driver: + driver = cast(ydb.aio.Driver, raw_driver) + await driver.wait(timeout=60) + + async with ydb.aio.QuerySessionPool(driver, size=config.pool_size, name="load-tank") as pool: + queue: asyncio.Queue[object] = asyncio.Queue(maxsize=max(config.worker_count * 4, config.peak_rps)) + stop = asyncio.Event() + workers = [ + asyncio.create_task(_worker(pool, queue, config.query, stop)) for _ in range(config.worker_count) + ] + error_task = asyncio.create_task(_error_worker(pool, config, stop)) + pressure_task = asyncio.create_task(_pool_pressure_worker(driver, config, stop)) + churn_task = asyncio.create_task(_session_churn_worker(driver, config, stop)) + + try: + async for rps, label, duration in _load_steps(config): + print("[%s] Phase: %s (%s RPS for %ss)" % (time.strftime("%H:%M:%S"), label, rps, duration)) + 
await _feed_phase(queue, rps, duration) + await asyncio.sleep(random.random() / 10.0) + + await queue.join() + finally: + stop.set() + for worker in workers: + worker.cancel() + error_task.cancel() + pressure_task.cancel() + churn_task.cancel() + await asyncio.gather(*workers, error_task, pressure_task, churn_task, return_exceptions=True) + + print("Waiting 10s to flush metrics...") + await asyncio.sleep(10) + meter_provider.shutdown() + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/examples/opentelemetry/metrics_views.py b/examples/opentelemetry/metrics_views.py new file mode 100644 index 000000000..f6a5440d4 --- /dev/null +++ b/examples/opentelemetry/metrics_views.py @@ -0,0 +1,42 @@ +"""OpenTelemetry metric views for the YDB Python SDK examples.""" + +from opentelemetry.sdk.metrics.view import ExplicitBucketHistogramAggregation, View + +DURATION_BUCKETS_SECONDS = ( + 0.0005, + 0.001, + 0.0025, + 0.005, + 0.01, + 0.025, + 0.05, + 0.1, + 0.25, + 0.5, + 1.0, + 2.5, + 5.0, +) + +ATTEMPT_BUCKETS = (1, 2, 3, 5, 10) + + +def ydb_metrics_views(): + return [ + View( + instrument_name="db.client.operation.duration", + aggregation=ExplicitBucketHistogramAggregation(boundaries=DURATION_BUCKETS_SECONDS), + ), + View( + instrument_name="ydb.query.session.create_time", + aggregation=ExplicitBucketHistogramAggregation(boundaries=DURATION_BUCKETS_SECONDS), + ), + View( + instrument_name="ydb.client.retry.duration", + aggregation=ExplicitBucketHistogramAggregation(boundaries=DURATION_BUCKETS_SECONDS), + ), + View( + instrument_name="ydb.client.retry.attempts", + aggregation=ExplicitBucketHistogramAggregation(boundaries=ATTEMPT_BUCKETS), + ), + ] diff --git a/examples/opentelemetry/otel_example.py b/examples/opentelemetry/otel_example.py index 3bdd6a6b8..102db2498 100644 --- a/examples/opentelemetry/otel_example.py +++ b/examples/opentelemetry/otel_example.py @@ -8,6 +8,7 @@ import asyncio import os +from typing import cast from 
opentelemetry.exporter.otlp.proto.grpc.metric_exporter import OTLPMetricExporter from opentelemetry.sdk.metrics import MeterProvider @@ -21,6 +22,8 @@ from opentelemetry.sdk.trace import TracerProvider from opentelemetry.sdk.trace.export import BatchSpanProcessor +from metrics_views import ydb_metrics_views + def _env(name: str, default: str) -> str: v = os.environ.get(name) @@ -61,7 +64,7 @@ async def main() -> None: OTLPMetricExporter(endpoint=otlp_endpoint), export_interval_millis=1000, ) - meter_provider = MeterProvider(resource=resource, metric_readers=[metric_reader]) + meter_provider = MeterProvider(resource=resource, metric_readers=[metric_reader], views=ydb_metrics_views()) enable_metrics(meter_provider) async with ydb.aio.Driver( From b1372d49f4e1edf57e90b95dd43f2c9b85e3bc5c Mon Sep 17 00:00:00 2001 From: tewbo Date: Fri, 22 May 2026 03:10:42 +0300 Subject: [PATCH 21/30] format --- examples/opentelemetry/otel_example.py | 1 - 1 file changed, 1 deletion(-) diff --git a/examples/opentelemetry/otel_example.py b/examples/opentelemetry/otel_example.py index 102db2498..0b2ae0fed 100644 --- a/examples/opentelemetry/otel_example.py +++ b/examples/opentelemetry/otel_example.py @@ -8,7 +8,6 @@ import asyncio import os -from typing import cast from opentelemetry.exporter.otlp.proto.grpc.metric_exporter import OTLPMetricExporter from opentelemetry.sdk.metrics import MeterProvider From 5526975c6bc6f1ad42333a254a64b34f917bc1de Mon Sep 17 00:00:00 2001 From: tewbo Date: Fri, 22 May 2026 15:41:05 +0300 Subject: [PATCH 22/30] remove views and add explicit boundaries to histograms --- examples/opentelemetry/Dockerfile | 1 - examples/opentelemetry/README.md | 10 +++--- examples/opentelemetry/load_tank.py | 4 +-- examples/opentelemetry/metrics_views.py | 42 ------------------------- examples/opentelemetry/otel_example.py | 4 +-- tests/opentelemetry/test_metrics.py | 7 +++++ ydb/opentelemetry/metrics.py | 29 +++++++++++++++++ 7 files changed, 42 insertions(+), 55 deletions(-) 
delete mode 100644 examples/opentelemetry/metrics_views.py diff --git a/examples/opentelemetry/Dockerfile b/examples/opentelemetry/Dockerfile index c981ee04d..041eb4abf 100644 --- a/examples/opentelemetry/Dockerfile +++ b/examples/opentelemetry/Dockerfile @@ -20,4 +20,3 @@ RUN pip install --no-cache-dir -e '.[opentelemetry]' -r examples/opentelemetry/r # Demo scripts. COPY examples/opentelemetry/otel_example.py ./examples/opentelemetry/otel_example.py COPY examples/opentelemetry/load_tank.py ./examples/opentelemetry/load_tank.py -COPY examples/opentelemetry/metrics_views.py ./examples/opentelemetry/metrics_views.py diff --git a/examples/opentelemetry/README.md b/examples/opentelemetry/README.md index d1667ea8d..92ad5812f 100644 --- a/examples/opentelemetry/README.md +++ b/examples/opentelemetry/README.md @@ -54,12 +54,10 @@ Prometheus queries for SDK metrics such as `db_client_operation_duration`, `ydb_client_retry_duration`. Use Grafana Explore for ad-hoc traces through Tempo and metrics through Prometheus. -The examples configure custom OpenTelemetry histogram views in -[`metrics_views.py`](metrics_views.py). The SDK records duration values in -seconds, but the default histogram buckets are too coarse for fast local YDB -operations. The custom views keep the `s` unit and use sub-millisecond / -millisecond-scale buckets so Grafana percentiles show meaningful latency -distributions. +The SDK configures explicit OpenTelemetry histogram bucket boundaries for its +own duration and retry-attempt metrics. Duration values are recorded in seconds, +with sub-millisecond and millisecond-scale buckets so Grafana percentiles show +meaningful latency distributions for fast local YDB operations. **Logs for `otel-example`:** the container name is prefixed (e.g. `opentelemetry-otel-example-1`); use `docker compose -f examples/opentelemetry/compose-e2e.yaml ps` or `docker ps -a` to find it. The service is one-shot (`restart: "no"`) — it may already have exited. 
diff --git a/examples/opentelemetry/load_tank.py b/examples/opentelemetry/load_tank.py index 2cdb67b65..5d7b3f654 100644 --- a/examples/opentelemetry/load_tank.py +++ b/examples/opentelemetry/load_tank.py @@ -17,8 +17,6 @@ import ydb from ydb.opentelemetry import enable_metrics -from metrics_views import ydb_metrics_views - @dataclass(frozen=True) class LoadConfig: @@ -219,7 +217,7 @@ async def main() -> None: OTLPMetricExporter(endpoint=config.otlp_endpoint), export_interval_millis=2000, ) - meter_provider = MeterProvider(resource=resource, metric_readers=[metric_reader], views=ydb_metrics_views()) + meter_provider = MeterProvider(resource=resource, metric_readers=[metric_reader]) enable_metrics(meter_provider) print( diff --git a/examples/opentelemetry/metrics_views.py b/examples/opentelemetry/metrics_views.py deleted file mode 100644 index f6a5440d4..000000000 --- a/examples/opentelemetry/metrics_views.py +++ /dev/null @@ -1,42 +0,0 @@ -"""OpenTelemetry metric views for the YDB Python SDK examples.""" - -from opentelemetry.sdk.metrics.view import ExplicitBucketHistogramAggregation, View - -DURATION_BUCKETS_SECONDS = ( - 0.0005, - 0.001, - 0.0025, - 0.005, - 0.01, - 0.025, - 0.05, - 0.1, - 0.25, - 0.5, - 1.0, - 2.5, - 5.0, -) - -ATTEMPT_BUCKETS = (1, 2, 3, 5, 10) - - -def ydb_metrics_views(): - return [ - View( - instrument_name="db.client.operation.duration", - aggregation=ExplicitBucketHistogramAggregation(boundaries=DURATION_BUCKETS_SECONDS), - ), - View( - instrument_name="ydb.query.session.create_time", - aggregation=ExplicitBucketHistogramAggregation(boundaries=DURATION_BUCKETS_SECONDS), - ), - View( - instrument_name="ydb.client.retry.duration", - aggregation=ExplicitBucketHistogramAggregation(boundaries=DURATION_BUCKETS_SECONDS), - ), - View( - instrument_name="ydb.client.retry.attempts", - aggregation=ExplicitBucketHistogramAggregation(boundaries=ATTEMPT_BUCKETS), - ), - ] diff --git a/examples/opentelemetry/otel_example.py 
b/examples/opentelemetry/otel_example.py index 0b2ae0fed..3bdd6a6b8 100644 --- a/examples/opentelemetry/otel_example.py +++ b/examples/opentelemetry/otel_example.py @@ -21,8 +21,6 @@ from opentelemetry.sdk.trace import TracerProvider from opentelemetry.sdk.trace.export import BatchSpanProcessor -from metrics_views import ydb_metrics_views - def _env(name: str, default: str) -> str: v = os.environ.get(name) @@ -63,7 +61,7 @@ async def main() -> None: OTLPMetricExporter(endpoint=otlp_endpoint), export_interval_millis=1000, ) - meter_provider = MeterProvider(resource=resource, metric_readers=[metric_reader], views=ydb_metrics_views()) + meter_provider = MeterProvider(resource=resource, metric_readers=[metric_reader]) enable_metrics(meter_provider) async with ydb.aio.Driver( diff --git a/tests/opentelemetry/test_metrics.py b/tests/opentelemetry/test_metrics.py index 2a1bc2c73..55d1d4b99 100644 --- a/tests/opentelemetry/test_metrics.py +++ b/tests/opentelemetry/test_metrics.py @@ -47,6 +47,9 @@ def test_metrics_registry_records_all_instruments(metrics_setup, monkeypatch): QUERY_SESSION_TIMEOUTS, RETRY_ATTEMPTS, RETRY_DURATION, + _ATTEMPT_BUCKETS, + _DURATION_BUCKETS_SECONDS, + _RETRY_DURATION_BUCKETS_SECONDS, create_metrics_operation, record_query_session_count, record_query_session_create_time, @@ -91,6 +94,10 @@ def test_metrics_registry_records_all_instruments(metrics_setup, monkeypatch): assert metrics[QUERY_SESSION_TIMEOUTS].unit == "{connection}" assert metrics[RETRY_DURATION].unit == "s" assert metrics[RETRY_ATTEMPTS].unit == "{attempt}" + assert _single_point_from_metrics(metrics, CLIENT_OPERATION_DURATION).explicit_bounds == _DURATION_BUCKETS_SECONDS + assert _single_point_from_metrics(metrics, QUERY_SESSION_CREATE_TIME).explicit_bounds == _DURATION_BUCKETS_SECONDS + assert _single_point_from_metrics(metrics, RETRY_DURATION).explicit_bounds == _RETRY_DURATION_BUCKETS_SECONDS + assert _single_point_from_metrics(metrics, RETRY_ATTEMPTS).explicit_bounds == 
_ATTEMPT_BUCKETS def test_metrics_registry_is_noop_without_meter(): diff --git a/ydb/opentelemetry/metrics.py b/ydb/opentelemetry/metrics.py index 7c6679e96..0926a6c3e 100644 --- a/ydb/opentelemetry/metrics.py +++ b/ydb/opentelemetry/metrics.py @@ -20,6 +20,31 @@ RETRY_ATTEMPTS = "ydb.client.retry.attempts" RETRY_DURATION = "ydb.client.retry.duration" +_DURATION_BUCKETS_SECONDS = ( + 0.001, + 0.005, + 0.01, + 0.05, + 0.1, + 0.5, + 1, + 5, + 10, +) +_RETRY_DURATION_BUCKETS_SECONDS = ( + 0.001, + 0.005, + 0.01, + 0.05, + 0.1, + 0.5, + 1, + 2, + 5, + 10, + 30, +) +_ATTEMPT_BUCKETS = (1, 2, 3, 4, 5, 7, 10, 20) _UNKNOWN_POOL = "unknown" _pool_name_counter = itertools.count(1) _pool_name_lock = threading.Lock() @@ -59,6 +84,7 @@ def set_meter( CLIENT_OPERATION_DURATION, unit="s", description="Duration of YDB client operations.", + explicit_bucket_boundaries_advisory=_DURATION_BUCKETS_SECONDS, ), CLIENT_OPERATION_FAILED: meter.create_counter( CLIENT_OPERATION_FAILED, @@ -75,6 +101,7 @@ def set_meter( QUERY_SESSION_CREATE_TIME, unit="s", description="Duration of YDB query session creation.", + explicit_bucket_boundaries_advisory=_DURATION_BUCKETS_SECONDS, ), QUERY_SESSION_PENDING_REQUESTS: meter.create_up_down_counter( QUERY_SESSION_PENDING_REQUESTS, @@ -99,6 +126,7 @@ def set_meter( "Total user-visible duration of a logical operation executed through the retry policy, " "including all attempts and back-off delays." ), + explicit_bucket_boundaries_advisory=_RETRY_DURATION_BUCKETS_SECONDS, ), RETRY_ATTEMPTS: meter.create_histogram( RETRY_ATTEMPTS, @@ -107,6 +135,7 @@ def set_meter( "Total number of attempts performed by the retry policy for one logical operation. " "A value of 1 means the operation succeeded on the first try." 
), + explicit_bucket_boundaries_advisory=_ATTEMPT_BUCKETS, ), } From d7ad8bb31ed0dcc97cff10af9be4d1cb218b3607 Mon Sep 17 00:00:00 2001 From: tewbo Date: Fri, 22 May 2026 18:31:39 +0300 Subject: [PATCH 23/30] refactoring --- docs/opentelemetry.rst | 10 +- examples/opentelemetry/README.md | 4 + tests/opentelemetry/test_metrics.py | 14 +- ydb/opentelemetry/__init__.py | 8 +- ydb/opentelemetry/metrics.py | 196 +++--------------- ydb/opentelemetry/metrics_plugin.py | 180 ++++++++++++++++ .../{plugin.py => tracing_plugin.py} | 54 ----- 7 files changed, 235 insertions(+), 231 deletions(-) create mode 100644 ydb/opentelemetry/metrics_plugin.py rename ydb/opentelemetry/{plugin.py => tracing_plugin.py} (71%) diff --git a/docs/opentelemetry.rst b/docs/opentelemetry.rst index fa2ce4834..95258177a 100644 --- a/docs/opentelemetry.rst +++ b/docs/opentelemetry.rst @@ -12,8 +12,11 @@ Metrics expose operation latency/failures, retry cost, and query session pool st Tracing and metrics are configured independently: enabling one does not require enabling the other. -Instrumentation is **zero-cost when disabled**: the SDK uses no-op stubs by default, so -there is no OpenTelemetry overhead unless you explicitly opt in. +Instrumentation is **zero-cost when disabled**: the SDK uses no-op tracing and +metrics registries by default, so importing the SDK does not import OpenTelemetry +or create metric instruments unless you explicitly opt in. ``enable_tracing()`` +loads the tracing plugin, while ``enable_metrics()`` loads the metrics plugin and +replaces the no-op metrics registry with an OpenTelemetry-backed registry. Installation @@ -118,7 +121,8 @@ SDK obtains a meter named ``"ydb.sdk"`` from the global meter provider. Repeated calls to ``enable_metrics()`` do nothing until you call ``disable_metrics()``, which clears the in-memory observable metric values and allows -metrics to be reconfigured. +metrics to be reconfigured. 
After disabling metrics, the SDK restores the no-op +metrics registry, so metric recording calls remain cheap no-ops. Metrics are independent from tracing. If both ``enable_tracing()`` and ``enable_metrics()`` are called, YDB client operations produce both spans and metrics. diff --git a/examples/opentelemetry/README.md b/examples/opentelemetry/README.md index 92ad5812f..33fd68991 100644 --- a/examples/opentelemetry/README.md +++ b/examples/opentelemetry/README.md @@ -59,6 +59,10 @@ own duration and retry-attempt metrics. Duration values are recorded in seconds, with sub-millisecond and millisecond-scale buckets so Grafana percentiles show meaningful latency distributions for fast local YDB operations. +Metrics are wired through a dedicated SDK metrics plugin. Until `enable_metrics()` +is called, the SDK uses a no-op metrics registry and does not import +OpenTelemetry metrics packages from the hot-path metric helpers. + **Logs for `otel-example`:** the container name is prefixed (e.g. `opentelemetry-otel-example-1`); use `docker compose -f examples/opentelemetry/compose-e2e.yaml ps` or `docker ps -a` to find it. The service is one-shot (`restart: "no"`) — it may already have exited. **Logs for `load-generator`:** the service is also one-shot. 
It runs for diff --git a/tests/opentelemetry/test_metrics.py b/tests/opentelemetry/test_metrics.py index 55d1d4b99..f5dfa3542 100644 --- a/tests/opentelemetry/test_metrics.py +++ b/tests/opentelemetry/test_metrics.py @@ -47,9 +47,9 @@ def test_metrics_registry_records_all_instruments(metrics_setup, monkeypatch): QUERY_SESSION_TIMEOUTS, RETRY_ATTEMPTS, RETRY_DURATION, - _ATTEMPT_BUCKETS, - _DURATION_BUCKETS_SECONDS, - _RETRY_DURATION_BUCKETS_SECONDS, + ATTEMPT_BUCKETS, + DURATION_BUCKETS_SECONDS, + RETRY_DURATION_BUCKETS_SECONDS, create_metrics_operation, record_query_session_count, record_query_session_create_time, @@ -94,10 +94,10 @@ def test_metrics_registry_records_all_instruments(metrics_setup, monkeypatch): assert metrics[QUERY_SESSION_TIMEOUTS].unit == "{connection}" assert metrics[RETRY_DURATION].unit == "s" assert metrics[RETRY_ATTEMPTS].unit == "{attempt}" - assert _single_point_from_metrics(metrics, CLIENT_OPERATION_DURATION).explicit_bounds == _DURATION_BUCKETS_SECONDS - assert _single_point_from_metrics(metrics, QUERY_SESSION_CREATE_TIME).explicit_bounds == _DURATION_BUCKETS_SECONDS - assert _single_point_from_metrics(metrics, RETRY_DURATION).explicit_bounds == _RETRY_DURATION_BUCKETS_SECONDS - assert _single_point_from_metrics(metrics, RETRY_ATTEMPTS).explicit_bounds == _ATTEMPT_BUCKETS + assert _single_point_from_metrics(metrics, CLIENT_OPERATION_DURATION).explicit_bounds == DURATION_BUCKETS_SECONDS + assert _single_point_from_metrics(metrics, QUERY_SESSION_CREATE_TIME).explicit_bounds == DURATION_BUCKETS_SECONDS + assert _single_point_from_metrics(metrics, RETRY_DURATION).explicit_bounds == RETRY_DURATION_BUCKETS_SECONDS + assert _single_point_from_metrics(metrics, RETRY_ATTEMPTS).explicit_bounds == ATTEMPT_BUCKETS def test_metrics_registry_is_noop_without_meter(): diff --git a/ydb/opentelemetry/__init__.py b/ydb/opentelemetry/__init__.py index 15c1e3cad..844c397e3 100644 --- a/ydb/opentelemetry/__init__.py +++ b/ydb/opentelemetry/__init__.py @@ 
-13,7 +13,7 @@ def enable_tracing(tracer=None): ``ydb.sdk`` from the global tracer provider will be used. """ try: - from ydb.opentelemetry.plugin import _enable_tracing + from ydb.opentelemetry.tracing_plugin import _enable_tracing except ImportError: raise ImportError( "OpenTelemetry packages are required for tracing support. " @@ -26,7 +26,7 @@ def enable_tracing(tracer=None): def disable_tracing(): """Disable YDB OpenTelemetry hooks and allow :func:`enable_tracing` to run again.""" try: - from ydb.opentelemetry.plugin import _disable_tracing + from ydb.opentelemetry.tracing_plugin import _disable_tracing except ImportError: return @@ -45,7 +45,7 @@ def enable_metrics(meter_provider=None): default meter named ``ydb.sdk`` from the global meter provider will be used. """ try: - from ydb.opentelemetry.plugin import _enable_metrics + from ydb.opentelemetry.metrics_plugin import _enable_metrics except ImportError: raise ImportError( "OpenTelemetry packages are required for metrics support. " @@ -58,7 +58,7 @@ def enable_metrics(meter_provider=None): def disable_metrics(): """Disable YDB OpenTelemetry metrics collection and allow :func:`enable_metrics` to run again.""" try: - from ydb.opentelemetry.plugin import _disable_metrics + from ydb.opentelemetry.metrics_plugin import _disable_metrics except ImportError: return diff --git a/ydb/opentelemetry/metrics.py b/ydb/opentelemetry/metrics.py index 0926a6c3e..a9ea7ba4d 100644 --- a/ydb/opentelemetry/metrics.py +++ b/ydb/opentelemetry/metrics.py @@ -1,8 +1,9 @@ """No-op-safe helpers for YDB OpenTelemetry client metrics. The SDK records metrics only after :func:`ydb.opentelemetry.enable_metrics` -installs OpenTelemetry instruments. Until then every helper is a cheap no-op, -which keeps metrics independent from tracing and safe to call from hot paths. +installs the OpenTelemetry-backed registry from ``metrics_plugin``. 
Until then +every helper delegates to a no-op registry, which keeps metrics independent from +tracing and safe to call from hot paths. """ import time @@ -20,7 +21,7 @@ RETRY_ATTEMPTS = "ydb.client.retry.attempts" RETRY_DURATION = "ydb.client.retry.duration" -_DURATION_BUCKETS_SECONDS = ( +DURATION_BUCKETS_SECONDS = ( 0.001, 0.005, 0.01, @@ -31,7 +32,7 @@ 5, 10, ) -_RETRY_DURATION_BUCKETS_SECONDS = ( +RETRY_DURATION_BUCKETS_SECONDS = ( 0.001, 0.005, 0.01, @@ -44,7 +45,7 @@ 10, 30, ) -_ATTEMPT_BUCKETS = (1, 2, 3, 4, 5, 7, 10, 20) +ATTEMPT_BUCKETS = (1, 2, 3, 4, 5, 7, 10, 20) _UNKNOWN_POOL = "unknown" _pool_name_counter = itertools.count(1) _pool_name_lock = threading.Lock() @@ -59,129 +60,34 @@ ) -class MetricsRegistry: - """Process-wide metric instrument registry. +class MetricRegistry: + """No-op metric registry used until the OpenTelemetry metrics plugin is enabled.""" - Regular instruments are recorded immediately. Observable query-session - instruments keep their latest values in memory and expose snapshots through - callbacks registered by ``ydb.opentelemetry.plugin``. 
- """ + enabled = False - def __init__(self) -> None: - self._instruments: Dict[str, Any] = {} - self._query_session_count_values: Dict[Any, int] = {} - self._query_session_max_values: Dict[Any, int] = {} - self._observable_values_lock = threading.Lock() - - def set_meter( - self, - meter: Any, - observe_query_session_count_callback: Any, - observe_query_session_max_callback: Any, - ) -> None: - self._instruments = { - CLIENT_OPERATION_DURATION: meter.create_histogram( - CLIENT_OPERATION_DURATION, - unit="s", - description="Duration of YDB client operations.", - explicit_bucket_boundaries_advisory=_DURATION_BUCKETS_SECONDS, - ), - CLIENT_OPERATION_FAILED: meter.create_counter( - CLIENT_OPERATION_FAILED, - unit="{command}", - description="Number of failed YDB client operations.", - ), - QUERY_SESSION_COUNT: meter.create_observable_up_down_counter( - QUERY_SESSION_COUNT, - callbacks=[observe_query_session_count_callback], - unit="{connection}", - description="Number of open YDB query sessions.", - ), - QUERY_SESSION_CREATE_TIME: meter.create_histogram( - QUERY_SESSION_CREATE_TIME, - unit="s", - description="Duration of YDB query session creation.", - explicit_bucket_boundaries_advisory=_DURATION_BUCKETS_SECONDS, - ), - QUERY_SESSION_PENDING_REQUESTS: meter.create_up_down_counter( - QUERY_SESSION_PENDING_REQUESTS, - unit="{request}", - description="Number of requests waiting for a YDB query session.", - ), - QUERY_SESSION_TIMEOUTS: meter.create_counter( - QUERY_SESSION_TIMEOUTS, - unit="{connection}", - description="Number of YDB query session acquisition timeouts.", - ), - QUERY_SESSION_MAX: meter.create_observable_up_down_counter( - QUERY_SESSION_MAX, - callbacks=[observe_query_session_max_callback], - unit="{connection}", - description="Maximum configured number of YDB query sessions.", - ), - RETRY_DURATION: meter.create_histogram( - RETRY_DURATION, - unit="s", - description=( - "Total user-visible duration of a logical operation executed through the retry policy, 
" - "including all attempts and back-off delays." - ), - explicit_bucket_boundaries_advisory=_RETRY_DURATION_BUCKETS_SECONDS, - ), - RETRY_ATTEMPTS: meter.create_histogram( - RETRY_ATTEMPTS, - unit="{attempt}", - description=( - "Total number of attempts performed by the retry policy for one logical operation. " - "A value of 1 means the operation succeeded on the first try." - ), - explicit_bucket_boundaries_advisory=_ATTEMPT_BUCKETS, - ), - } + def create_metrics_operation(self, name: str, attributes: Optional[Dict[str, Any]] = None): + return _NOOP_METRICS_OPERATION def clear(self) -> None: - self._instruments = {} - with self._observable_values_lock: - self._query_session_count_values = {} - self._query_session_max_values = {} + pass def add(self, name: str, value: int, attributes: Optional[Dict[str, Any]] = None) -> None: - """Add ``value`` to a counter-like instrument if metrics are enabled.""" - instrument = self._instruments.get(name) - if instrument is not None: - instrument.add(value, attributes=attributes or {}) + pass def record(self, name: str, value: float, attributes: Optional[Dict[str, Any]] = None) -> None: - """Record ``value`` in a histogram-like instrument if metrics are enabled.""" - instrument = self._instruments.get(name) - if instrument is not None: - instrument.record(value, attributes=attributes or {}) + pass def add_query_session_count(self, value: int, attributes: Optional[Dict[str, Any]] = None) -> None: - attrs = tuple(sorted((attributes or {}).items())) - - with self._observable_values_lock: - new_value = self._query_session_count_values.get(attrs, 0) + value - - self._query_session_count_values.pop(attrs, None) - self._query_session_count_values[attrs] = new_value + pass def get_query_session_count_values(self) -> Dict[Any, int]: - with self._observable_values_lock: - return dict(self._query_session_count_values) + return {} def set_query_session_max(self, value: int, attributes: Optional[Dict[str, Any]] = None) -> None: - attrs = 
tuple(sorted((attributes or {}).items())) - - with self._observable_values_lock: - self._query_session_max_values[attrs] = value + pass def get_query_session_max_values(self) -> Dict[Any, int]: - with self._observable_values_lock: - return dict(self._query_session_max_values) - - -_metrics_registry: Optional[MetricsRegistry] = None + return {} class _NoopMetricsOperation: @@ -216,53 +122,35 @@ def __exit__(self, exc_type, exc_val, exc_tb): _NOOP_METRICS_OPERATION = _NoopMetricsOperation() +_NOOP_METRICS_REGISTRY = MetricRegistry() +_metrics_registry: MetricRegistry = _NOOP_METRICS_REGISTRY def is_metrics_enabled() -> bool: - return _metrics_registry is not None + return _metrics_registry.enabled -def enable_metrics_registry( - meter: Any, - observe_query_session_count_callback: Any, - observe_query_session_max_callback: Any, -) -> None: - global _metrics_registry - - if _metrics_registry is None: - _metrics_registry = MetricsRegistry() - _metrics_registry.set_meter(meter, observe_query_session_count_callback, observe_query_session_max_callback) +def next_query_session_pool_name() -> str: + """Return a process-unique default query session pool name for metric labels.""" + return "query-session-pool-%d" % next(_pool_name_counter) -def disable_metrics_registry() -> None: +def _set_metrics_registry(metrics_registry: MetricRegistry) -> None: global _metrics_registry - if _metrics_registry is not None: - _metrics_registry.clear() - _metrics_registry = None - - -def get_query_session_count_values() -> Dict[Any, int]: - if _metrics_registry is None: - return {} - return _metrics_registry.get_query_session_count_values() + _metrics_registry = metrics_registry +def _reset_metrics_registry() -> None: + global _metrics_registry -def get_query_session_max_values() -> Dict[Any, int]: - if _metrics_registry is None: - return {} - return _metrics_registry.get_query_session_max_values() + _metrics_registry.clear() + _metrics_registry = _NOOP_METRICS_REGISTRY def 
_pool_attrs(pool_name: Optional[str]) -> Dict[str, Any]: return {"ydb.query.session.pool.name": pool_name or _UNKNOWN_POOL} -def next_query_session_pool_name() -> str: - """Return a process-unique default query session pool name for metric labels.""" - return "query-session-pool-%d" % next(_pool_name_counter) - - def _operation_attrs(operation_name: str, attributes: Dict[str, Any]) -> Dict[str, Any]: return { "db.system.name": attributes.get("db.system.name", "ydb"), @@ -315,17 +203,13 @@ def end(self) -> None: return self._ended = True - registry = _metrics_registry - if registry is None: - return - duration = time.monotonic() - self._start_time - registry.record(CLIENT_OPERATION_DURATION, duration, self._attributes) + _metrics_registry.record(CLIENT_OPERATION_DURATION, duration, self._attributes) if self._exception is not None: attrs = dict(self._attributes) attrs["db.response.status_code"] = _response_status_code(self._exception) - registry.add(CLIENT_OPERATION_FAILED, 1, attrs) + _metrics_registry.add(CLIENT_OPERATION_FAILED, 1, attrs) def __enter__(self) -> "MetricsOperation": return self @@ -357,45 +241,31 @@ def __exit__(self, exc_type, exc_val, exc_tb): def create_metrics_operation(name: str, attributes: Optional[Dict[str, Any]] = None): - if _metrics_registry is None: - return _NOOP_METRICS_OPERATION - return MetricsOperation(name, attributes) + return _metrics_registry.create_metrics_operation(name, attributes) def record_query_session_count(delta: int, pool_name: Optional[str] = None, state: str = "used") -> None: - if _metrics_registry is None: - return attrs = _pool_attrs(pool_name) attrs["ydb.query.session.state"] = state _metrics_registry.add_query_session_count(delta, attrs) def record_query_session_create_time(duration: float, pool_name: Optional[str]) -> None: - if _metrics_registry is None: - return _metrics_registry.record(QUERY_SESSION_CREATE_TIME, duration, _pool_attrs(pool_name)) def record_query_session_pending_requests(delta: int, 
pool_name: Optional[str]) -> None: - if _metrics_registry is None: - return _metrics_registry.add(QUERY_SESSION_PENDING_REQUESTS, delta, _pool_attrs(pool_name)) def record_query_session_timeout(pool_name: Optional[str]) -> None: - if _metrics_registry is None: - return _metrics_registry.add(QUERY_SESSION_TIMEOUTS, 1, _pool_attrs(pool_name)) def record_query_session_max(value: int, pool_name: Optional[str]) -> None: - if _metrics_registry is None: - return _metrics_registry.set_query_session_max(value, _pool_attrs(pool_name)) def record_retry_metrics(duration: float, attempts: int) -> None: - if _metrics_registry is None: - return _metrics_registry.record(RETRY_DURATION, duration) _metrics_registry.record(RETRY_ATTEMPTS, attempts) diff --git a/ydb/opentelemetry/metrics_plugin.py b/ydb/opentelemetry/metrics_plugin.py new file mode 100644 index 000000000..408dd1e66 --- /dev/null +++ b/ydb/opentelemetry/metrics_plugin.py @@ -0,0 +1,180 @@ +"""OpenTelemetry metrics bridge for YDB.""" + +import threading +from typing import Any, Dict, Iterable, Optional, Union + +from opentelemetry import metrics as otel_metrics +from opentelemetry.metrics import ( + CallbackOptions, + Counter, + Histogram, + Meter, + MeterProvider, + ObservableUpDownCounter, + Observation, + UpDownCounter, +) + +from ydb.opentelemetry.metrics import ( + CLIENT_OPERATION_DURATION, + CLIENT_OPERATION_FAILED, + QUERY_SESSION_COUNT, + QUERY_SESSION_CREATE_TIME, + QUERY_SESSION_MAX, + QUERY_SESSION_PENDING_REQUESTS, + QUERY_SESSION_TIMEOUTS, + RETRY_ATTEMPTS, + RETRY_DURATION, + ATTEMPT_BUCKETS, + DURATION_BUCKETS_SECONDS, + RETRY_DURATION_BUCKETS_SECONDS, + MetricRegistry as NoOpMetricRegistry, + MetricsOperation, + _reset_metrics_registry, + _set_metrics_registry, +) + +_MetricInstrument = Union[Counter, Histogram, ObservableUpDownCounter, UpDownCounter] + +_meter: Optional[Meter] = None + + +class MetricsRegistry(NoOpMetricRegistry): + """Process-wide OpenTelemetry metric instrument registry.""" + + 
enabled = True + + def __init__(self, meter: Meter) -> None: + self._query_session_count_values: Dict[Any, int] = {} + self._query_session_max_values: Dict[Any, int] = {} + self._observable_values_lock = threading.Lock() + self._instruments: Dict[str, _MetricInstrument] = { + CLIENT_OPERATION_DURATION: meter.create_histogram( + CLIENT_OPERATION_DURATION, + unit="s", + description="Duration of YDB client operations.", + explicit_bucket_boundaries_advisory=DURATION_BUCKETS_SECONDS, + ), + CLIENT_OPERATION_FAILED: meter.create_counter( + CLIENT_OPERATION_FAILED, + unit="{command}", + description="Number of failed YDB client operations.", + ), + QUERY_SESSION_COUNT: meter.create_observable_up_down_counter( + QUERY_SESSION_COUNT, + callbacks=[self._observe_query_session_count], + unit="{connection}", + description="Number of open YDB query sessions.", + ), + QUERY_SESSION_CREATE_TIME: meter.create_histogram( + QUERY_SESSION_CREATE_TIME, + unit="s", + description="Duration of YDB query session creation.", + explicit_bucket_boundaries_advisory=DURATION_BUCKETS_SECONDS, + ), + QUERY_SESSION_PENDING_REQUESTS: meter.create_up_down_counter( + QUERY_SESSION_PENDING_REQUESTS, + unit="{request}", + description="Number of requests waiting for a YDB query session.", + ), + QUERY_SESSION_TIMEOUTS: meter.create_counter( + QUERY_SESSION_TIMEOUTS, + unit="{connection}", + description="Number of YDB query session acquisition timeouts.", + ), + QUERY_SESSION_MAX: meter.create_observable_up_down_counter( + QUERY_SESSION_MAX, + callbacks=[self._observe_query_session_max], + unit="{connection}", + description="Maximum configured number of YDB query sessions.", + ), + RETRY_DURATION: meter.create_histogram( + RETRY_DURATION, + unit="s", + description=( + "Total user-visible duration of a logical operation executed through the retry policy, " + "including all attempts and back-off delays." 
+ ), + explicit_bucket_boundaries_advisory=RETRY_DURATION_BUCKETS_SECONDS, + ), + RETRY_ATTEMPTS: meter.create_histogram( + RETRY_ATTEMPTS, + unit="{attempt}", + description=( + "Total number of attempts performed by the retry policy for one logical operation. " + "A value of 1 means the operation succeeded on the first try." + ), + explicit_bucket_boundaries_advisory=ATTEMPT_BUCKETS, + ), + } + + def create_metrics_operation(self, name: str, attributes: Optional[Dict[str, Any]] = None) -> MetricsOperation: + return MetricsOperation(name, attributes) + + def clear(self) -> None: + self._instruments = {} + with self._observable_values_lock: + self._query_session_count_values = {} + self._query_session_max_values = {} + + def add(self, name: str, value: int, attributes: Optional[Dict[str, Any]] = None) -> None: + """Add ``value`` to a counter-like instrument if metrics are enabled.""" + instrument = self._instruments.get(name) + if instrument is not None: + instrument.add(value, attributes=attributes or {}) + + def record(self, name: str, value: float, attributes: Optional[Dict[str, Any]] = None) -> None: + """Record ``value`` in a histogram-like instrument if metrics are enabled.""" + instrument = self._instruments.get(name) + if instrument is not None: + instrument.record(value, attributes=attributes or {}) + + def add_query_session_count(self, value: int, attributes: Optional[Dict[str, Any]] = None) -> None: + attrs = tuple(sorted((attributes or {}).items())) + + with self._observable_values_lock: + new_value = self._query_session_count_values.get(attrs, 0) + value + + self._query_session_count_values.pop(attrs, None) + self._query_session_count_values[attrs] = new_value + + def set_query_session_max(self, value: int, attributes: Optional[Dict[str, Any]] = None) -> None: + attrs = tuple(sorted((attributes or {}).items())) + + with self._observable_values_lock: + self._query_session_max_values[attrs] = value + + def _observe_query_session_count(self, _: 
CallbackOptions) -> Iterable[Observation]: + return self._observe(self._query_session_count_values) + + def _observe_query_session_max(self, _: CallbackOptions) -> Iterable[Observation]: + return self._observe(self._query_session_max_values) + + def _observe(self, values: Dict[Any, int]) -> Iterable[Observation]: + with self._observable_values_lock: + return [Observation(value, attributes=dict(attrs)) for attrs, value in values.items()] + + +def _enable_metrics(meter_provider: Optional[MeterProvider]) -> None: + """Create SDK metric instruments from an OTel MeterProvider and enable recording.""" + global _meter + + if _meter is not None: + return + + if meter_provider is None: + _meter = otel_metrics.get_meter("ydb.sdk") + elif hasattr(meter_provider, "get_meter"): + _meter = meter_provider.get_meter("ydb.sdk") + else: + raise TypeError("meter_provider must be an OpenTelemetry MeterProvider") + + registry = MetricsRegistry(_meter) + _set_metrics_registry(registry) + + +def _disable_metrics() -> None: + global _meter + + _reset_metrics_registry() + _meter = None diff --git a/ydb/opentelemetry/plugin.py b/ydb/opentelemetry/tracing_plugin.py similarity index 71% rename from ydb/opentelemetry/plugin.py rename to ydb/opentelemetry/tracing_plugin.py index b0e2b722f..59cabde1f 100644 --- a/ydb/opentelemetry/plugin.py +++ b/ydb/opentelemetry/tracing_plugin.py @@ -2,19 +2,11 @@ from opentelemetry import context as otel_context from opentelemetry import trace -from opentelemetry import metrics as otel_metrics -from opentelemetry.metrics import Observation from opentelemetry.propagate import inject from opentelemetry.trace import StatusCode from ydb import issues from ydb.issues import StatusCode as YdbStatusCode -from ydb.opentelemetry.metrics import ( - disable_metrics_registry, - enable_metrics_registry, - get_query_session_count_values, - get_query_session_max_values, -) from ydb.opentelemetry.tracing import _registry as _tracing_registry # YDB client transport StatusCode 
values (401xxx band) -> OTel error.type transport_error. @@ -30,7 +22,6 @@ _tracer = None _tracing_enabled = False -_meter = None _KIND_MAP = { "client": trace.SpanKind.CLIENT, @@ -141,48 +132,3 @@ def _disable_tracing(): _tracing_registry.set_metadata_hook(None) _tracing_enabled = False _tracer = None - - -def _create_observable_callback(get_values): - """Create callback for observable metrics backed by the metrics registry.""" - - def observe(_): - values = get_values() - return [Observation(value, attributes=dict(attrs)) for attrs, value in values.items()] - - return observe - - -def _create_query_session_count_callback(): - """Create callback for observable query session count metric.""" - return _create_observable_callback(get_query_session_count_values) - - -def _create_query_session_max_callback(): - """Create callback for observable query session max metric.""" - return _create_observable_callback(get_query_session_max_values) - - -def _enable_metrics(meter_provider): - """Create SDK metric instruments from an OTel MeterProvider and enable recording.""" - global _meter - - if _meter is not None: - return - - if meter_provider is None: - _meter = otel_metrics.get_meter("ydb.sdk") - elif hasattr(meter_provider, "get_meter"): - _meter = meter_provider.get_meter("ydb.sdk") - else: - raise TypeError("meter_provider must be an OpenTelemetry MeterProvider") - - enable_metrics_registry(_meter, _create_query_session_count_callback(), _create_query_session_max_callback()) - - -def _disable_metrics(): - global _meter - - disable_metrics_registry() - if _meter is not None: - _meter = None From f836acf3ec3aef02ccf117511e5287c32ac31e8e Mon Sep 17 00:00:00 2001 From: tewbo Date: Fri, 22 May 2026 18:39:43 +0300 Subject: [PATCH 24/30] add endpoint to pool name --- tests/opentelemetry/test_metrics.py | 33 +++++++++++++++++++++++++++++ ydb/aio/query/pool.py | 5 +++-- ydb/opentelemetry/metrics.py | 5 +++++ ydb/query/pool.py | 5 +++-- 4 files changed, 44 insertions(+), 4 
deletions(-) diff --git a/tests/opentelemetry/test_metrics.py b/tests/opentelemetry/test_metrics.py index f5dfa3542..31ab4a368 100644 --- a/tests/opentelemetry/test_metrics.py +++ b/tests/opentelemetry/test_metrics.py @@ -338,6 +338,22 @@ def test_sync_query_session_pool_records_max(metrics_setup): assert _single_point(metrics_setup, QUERY_SESSION_MAX).attributes == {"ydb.query.session.pool.name": "sync-pool"} +def test_sync_query_session_pool_uses_endpoint_as_default_pool_name(metrics_setup): + from tests.opentelemetry.conftest import FakeDriverConfig + from ydb.opentelemetry.metrics import QUERY_SESSION_MAX + from ydb.query.pool import QuerySessionPool + + class FakeDriver: + _driver_config = FakeDriverConfig(endpoint="grpc://localhost:2136") + + QuerySessionPool(driver=FakeDriver(), size=42) + + assert _single_point(metrics_setup, QUERY_SESSION_MAX).value == 42 + assert _single_point(metrics_setup, QUERY_SESSION_MAX).attributes == { + "ydb.query.session.pool.name": "grpc://localhost:2136" + } + + @pytest.mark.asyncio async def test_async_query_session_pool_records_max(metrics_setup): from ydb.aio.query.pool import QuerySessionPool @@ -349,6 +365,23 @@ async def test_async_query_session_pool_records_max(metrics_setup): assert _single_point(metrics_setup, QUERY_SESSION_MAX).attributes == {"ydb.query.session.pool.name": "async-pool"} +@pytest.mark.asyncio +async def test_async_query_session_pool_uses_endpoint_as_default_pool_name(metrics_setup): + from tests.opentelemetry.conftest import FakeDriverConfig + from ydb.aio.query.pool import QuerySessionPool + from ydb.opentelemetry.metrics import QUERY_SESSION_MAX + + class FakeDriver: + _driver_config = FakeDriverConfig(endpoint="grpc://localhost:2136") + + QuerySessionPool(driver=FakeDriver(), size=24) + + assert _single_point(metrics_setup, QUERY_SESSION_MAX).value == 24 + assert _single_point(metrics_setup, QUERY_SESSION_MAX).attributes == { + "ydb.query.session.pool.name": "grpc://localhost:2136" + } + + 
@pytest.mark.asyncio async def test_sync_and_async_query_session_pool_auto_names_do_not_collide(metrics_setup): from ydb.aio.query.pool import QuerySessionPool as AsyncQuerySessionPool diff --git a/ydb/aio/query/pool.py b/ydb/aio/query/pool.py index 547b458e0..b2145c097 100644 --- a/ydb/aio/query/pool.py +++ b/ydb/aio/query/pool.py @@ -24,7 +24,7 @@ from ... import convert from ... import issues from ...opentelemetry.metrics import ( - next_query_session_pool_name, + query_session_pool_name, record_query_session_count, record_query_session_create_time, record_query_session_max, @@ -63,7 +63,8 @@ def __init__( self._current_size = 0 self._loop = asyncio.get_running_loop() if loop is None else loop self._query_client_settings = query_client_settings - self._metrics_pool_name = name or next_query_session_pool_name() + driver_config = getattr(driver, "_driver_config", None) + self._metrics_pool_name = query_session_pool_name(name, getattr(driver_config, "endpoint", None)) record_query_session_max(self._size, self._metrics_pool_name) async def _create_new_session(self): diff --git a/ydb/opentelemetry/metrics.py b/ydb/opentelemetry/metrics.py index a9ea7ba4d..f623354c5 100644 --- a/ydb/opentelemetry/metrics.py +++ b/ydb/opentelemetry/metrics.py @@ -135,11 +135,16 @@ def next_query_session_pool_name() -> str: return "query-session-pool-%d" % next(_pool_name_counter) +def query_session_pool_name(name: Optional[str], endpoint: Optional[str]) -> str: + return name or endpoint or next_query_session_pool_name() + + def _set_metrics_registry(metrics_registry: MetricRegistry) -> None: global _metrics_registry _metrics_registry = metrics_registry + def _reset_metrics_registry() -> None: global _metrics_registry diff --git a/ydb/query/pool.py b/ydb/query/pool.py index cfa45aded..41198bcc4 100644 --- a/ydb/query/pool.py +++ b/ydb/query/pool.py @@ -28,7 +28,7 @@ from .. 
import convert from ..settings import BaseRequestSettings from ..opentelemetry.metrics import ( - next_query_session_pool_name, + query_session_pool_name, record_query_session_count, record_query_session_create_time, record_query_session_max, @@ -73,7 +73,8 @@ def __init__( self._should_stop = threading.Event() self._lock = threading.RLock() self._query_client_settings = query_client_settings - self._metrics_pool_name = name or next_query_session_pool_name() + driver_config = getattr(driver, "_driver_config", None) + self._metrics_pool_name = query_session_pool_name(name, getattr(driver_config, "endpoint", None)) record_query_session_max(self._size, self._metrics_pool_name) def _create_new_session(self, timeout: Optional[float]): From ef7260af66e70aa0ce2a0deb3784cb413d3db5ab Mon Sep 17 00:00:00 2001 From: tewbo Date: Fri, 22 May 2026 19:24:21 +0300 Subject: [PATCH 25/30] change attributes --- docs/opentelemetry.rst | 19 +++++----- tests/opentelemetry/test_metrics.py | 51 +++++++++++++++++--------- ydb/opentelemetry/_endpoint.py | 21 +++++++++++ ydb/opentelemetry/metrics.py | 57 +++++++++++++++++++++++------ ydb/opentelemetry/tracing.py | 36 +++--------------- 5 files changed, 116 insertions(+), 68 deletions(-) create mode 100644 ydb/opentelemetry/_endpoint.py diff --git a/docs/opentelemetry.rst b/docs/opentelemetry.rst index 95258177a..1fbdec8ed 100644 --- a/docs/opentelemetry.rst +++ b/docs/opentelemetry.rst @@ -284,19 +284,18 @@ Operation metrics use stable labels only: * - Attribute - Description - * - ``db.system.name`` - - Always ``"ydb"``. - * - ``db.namespace`` + * - ``database`` - Database path. - * - ``server.address`` - - Host from the configured endpoint. - * - ``server.port`` - - Port from the configured endpoint. - * - ``ydb.operation.name`` - - SDK operation name, for example ``"ydb.ExecuteQuery"``. - * - ``db.response.status_code`` + * - ``endpoint`` + - Configured endpoint in ``host:port`` form. 
+ * - ``operation.name`` + - SDK operation name without the ``ydb.`` prefix, for example ``"ExecuteQuery"``. + * - ``status_code`` - Added only to ``ydb.client.operation.failed``. +Operation metrics are recorded for ``ExecuteQuery``, ``Commit``, ``Rollback``, +``CreateSession``, and ``BeginTransaction``. + Query session metrics use ``ydb.query.session.pool.name``. The pool name is generated automatically, or can be set explicitly with ``QuerySessionPool(..., name="main-pool")`` for both synchronous and asynchronous pools. ``ydb.query.session.count`` also includes diff --git a/tests/opentelemetry/test_metrics.py b/tests/opentelemetry/test_metrics.py index 31ab4a368..18f3ab62a 100644 --- a/tests/opentelemetry/test_metrics.py +++ b/tests/opentelemetry/test_metrics.py @@ -127,9 +127,8 @@ def test_metrics_operation_records_duration_once(metrics_setup, monkeypatch): operation = create_metrics_operation( "ExecuteQuery", { - "db.namespace": "/Root/test", - "server.address": "localhost", - "server.port": 2136, + "database": "/Root/test", + "endpoint": "localhost:2136", }, ) operation.end() @@ -140,11 +139,9 @@ def test_metrics_operation_records_duration_once(metrics_setup, monkeypatch): assert point.sum == 0.25 assert point.count == 1 assert point.attributes == { - "db.system.name": "ydb", - "db.namespace": "/Root/test", - "server.address": "localhost", - "server.port": 2136, - "ydb.operation.name": "ExecuteQuery", + "database": "/Root/test", + "endpoint": "localhost:2136", + "operation.name": "ExecuteQuery", } @@ -161,8 +158,8 @@ def test_metrics_operation_records_ydb_error(metrics_setup, monkeypatch): point = _single_point(metrics_setup, CLIENT_OPERATION_FAILED) assert point.value == 1 - assert point.attributes["db.response.status_code"] == "UNAVAILABLE" - assert point.attributes["ydb.operation.name"] == "ExecuteQuery" + assert point.attributes["status_code"] == "UNAVAILABLE" + assert point.attributes["operation.name"] == "ExecuteQuery" def 
test_metrics_operation_records_generic_error_status_code(metrics_setup): @@ -172,23 +169,27 @@ def test_metrics_operation_records_generic_error_status_code(metrics_setup): with create_metrics_operation("ExecuteQuery"): raise ValueError("bad value") - assert _single_point(metrics_setup, CLIENT_OPERATION_FAILED).attributes["db.response.status_code"] == "ValueError" + assert _single_point(metrics_setup, CLIENT_OPERATION_FAILED).attributes["status_code"] == "ValueError" def test_metrics_operation_set_attribute(metrics_setup): from ydb.opentelemetry.metrics import CLIENT_OPERATION_DURATION, create_metrics_operation operation = create_metrics_operation("ExecuteQuery") - operation.set_attribute("db.namespace", "/Root/test") + operation.set_attribute("database", "/Root/test") operation.end() - assert _single_point(metrics_setup, CLIENT_OPERATION_DURATION).attributes["db.namespace"] == "/Root/test" + assert _single_point(metrics_setup, CLIENT_OPERATION_DURATION).attributes["database"] == "/Root/test" def test_metrics_operation_ignores_non_metric_attributes(metrics_setup): from ydb.opentelemetry.metrics import CLIENT_OPERATION_DURATION, create_metrics_operation operation = create_metrics_operation("ExecuteQuery") + operation.set_attribute("db.namespace", "/Root/test") + operation.set_attribute("server.address", "localhost") + operation.set_attribute("server.port", 2136) + operation.set_attribute("ydb.operation.name", "ydb.Commit") operation.set_attribute("network.peer.address", "node.example.net") operation.set_attribute("network.peer.port", 2136) operation.set_attribute("ydb.node.dc", "dc-a") @@ -197,6 +198,10 @@ def test_metrics_operation_ignores_non_metric_attributes(metrics_setup): attrs = _single_point(metrics_setup, CLIENT_OPERATION_DURATION).attributes + assert "db.namespace" not in attrs + assert "server.address" not in attrs + assert "server.port" not in attrs + assert "ydb.operation.name" not in attrs assert "network.peer.address" not in attrs assert 
"network.peer.port" not in attrs assert "ydb.node.dc" not in attrs @@ -219,6 +224,15 @@ def test_metrics_operation_respects_end_on_exit_false(metrics_setup): assert point.sum >= 0 +def test_metrics_operation_ignores_unknown_operation_name(metrics_setup): + from ydb.opentelemetry.metrics import CLIENT_OPERATION_DURATION, create_metrics_operation + + with create_metrics_operation("ydb.Driver.Initialize"): + pass + + assert CLIENT_OPERATION_DURATION not in _metrics_by_name(metrics_setup) + + def test_create_ydb_span_records_metrics_when_tracing_is_active(metrics_setup, tracing_setup): from tests.opentelemetry.conftest import FakeDriverConfig from ydb.opentelemetry.metrics import CLIENT_OPERATION_DURATION @@ -243,7 +257,9 @@ def test_create_ydb_span_records_metrics_when_tracing_is_active(metrics_setup, t assert span_attrs["ydb.node.id"] == 123 metric_attrs = _single_point(metrics_setup, CLIENT_OPERATION_DURATION).attributes - assert metric_attrs["ydb.operation.name"] == "ydb.ExecuteQuery" + assert metric_attrs["database"] == "/test_database" + assert metric_attrs["endpoint"] == "test_endpoint:1337" + assert metric_attrs["operation.name"] == "ExecuteQuery" assert "network.peer.address" not in metric_attrs assert "network.peer.port" not in metric_attrs assert "ydb.node.dc" not in metric_attrs @@ -260,9 +276,10 @@ def test_create_ydb_span_records_metrics_when_tracing_is_disabled(metrics_setup) with create_ydb_span("ydb.ExecuteQuery", FakeDriverConfig()).attach_context(): pass - assert ( - _single_point(metrics_setup, CLIENT_OPERATION_DURATION).attributes["ydb.operation.name"] == "ydb.ExecuteQuery" - ) + metric_attrs = _single_point(metrics_setup, CLIENT_OPERATION_DURATION).attributes + assert metric_attrs["database"] == "/test_database" + assert metric_attrs["endpoint"] == "test_endpoint:1337" + assert metric_attrs["operation.name"] == "ExecuteQuery" def test_query_session_count_accumulates_by_attributes(metrics_setup): diff --git a/ydb/opentelemetry/_endpoint.py 
b/ydb/opentelemetry/_endpoint.py new file mode 100644 index 000000000..a88b19bc3 --- /dev/null +++ b/ydb/opentelemetry/_endpoint.py @@ -0,0 +1,21 @@ +from typing import Optional, Tuple + + +def split_endpoint(endpoint: Optional[str]) -> Tuple[str, int]: + ep = endpoint or "" + if ep.startswith("grpcs://"): + ep = ep[len("grpcs://") :] + elif ep.startswith("grpc://"): + ep = ep[len("grpc://") :] + + if ep.startswith("["): + close = ep.find("]") + if close != -1 and len(ep) > close + 1 and ep[close + 1] == ":": + host = ep[: close + 1] + port_s = ep[close + 2 :] + return host, int(port_s) if port_s.isdigit() else 0 + + host, sep, port_s = ep.rpartition(":") + if not sep: + return ep, 0 + return host, int(port_s) if port_s.isdigit() else 0 diff --git a/ydb/opentelemetry/metrics.py b/ydb/opentelemetry/metrics.py index f623354c5..d45aecd96 100644 --- a/ydb/opentelemetry/metrics.py +++ b/ydb/opentelemetry/metrics.py @@ -11,6 +11,8 @@ import itertools from typing import Any, Dict, Optional +from ydb.opentelemetry._endpoint import split_endpoint + CLIENT_OPERATION_DURATION = "db.client.operation.duration" CLIENT_OPERATION_FAILED = "ydb.client.operation.failed" QUERY_SESSION_COUNT = "ydb.query.session.count" @@ -51,13 +53,32 @@ _pool_name_lock = threading.Lock() _OPERATION_ATTR_KEYS = frozenset( { - "db.system.name", - "db.namespace", - "server.address", - "server.port", - "ydb.operation.name", + "database", + "endpoint", + "operation.name", + } +) +_CLIENT_OPERATION_NAMES = frozenset( + { + "ExecuteQuery", + "Commit", + "Rollback", + "CreateSession", + "BeginTransaction", } ) +_CLIENT_OPERATION_NAME_BY_INPUT = { + "ydb.ExecuteQuery": "ExecuteQuery", + "ExecuteQuery": "ExecuteQuery", + "ydb.Commit": "Commit", + "Commit": "Commit", + "ydb.Rollback": "Rollback", + "Rollback": "Rollback", + "ydb.CreateSession": "CreateSession", + "CreateSession": "CreateSession", + "ydb.BeginTransaction": "BeginTransaction", + "BeginTransaction": "BeginTransaction", +} class MetricRegistry: @@ 
-156,13 +177,25 @@ def _pool_attrs(pool_name: Optional[str]) -> Dict[str, Any]: return {"ydb.query.session.pool.name": pool_name or _UNKNOWN_POOL} +def _build_ydb_metrics_attrs(driver_config) -> Dict[str, Any]: + host, port = split_endpoint(getattr(driver_config, "endpoint", None)) + endpoint = "%s:%d" % (host, port) if port else host + return { + "database": getattr(driver_config, "database", None) or "", + "endpoint": endpoint, + } + + +def _operation_name(operation_name: str) -> str: + return _CLIENT_OPERATION_NAME_BY_INPUT.get(operation_name, operation_name) + + def _operation_attrs(operation_name: str, attributes: Dict[str, Any]) -> Dict[str, Any]: + name = _operation_name(operation_name) return { - "db.system.name": attributes.get("db.system.name", "ydb"), - "db.namespace": attributes.get("db.namespace", ""), - "server.address": attributes.get("server.address", ""), - "server.port": attributes.get("server.port", 0), - "ydb.operation.name": operation_name, + "database": attributes.get("database", ""), + "endpoint": attributes.get("endpoint", ""), + "operation.name": name, } @@ -213,7 +246,7 @@ def end(self) -> None: if self._exception is not None: attrs = dict(self._attributes) - attrs["db.response.status_code"] = _response_status_code(self._exception) + attrs["status_code"] = _response_status_code(self._exception) _metrics_registry.add(CLIENT_OPERATION_FAILED, 1, attrs) def __enter__(self) -> "MetricsOperation": @@ -246,6 +279,8 @@ def __exit__(self, exc_type, exc_val, exc_tb): def create_metrics_operation(name: str, attributes: Optional[Dict[str, Any]] = None): + if _operation_name(name) not in _CLIENT_OPERATION_NAMES: + return _NOOP_METRICS_OPERATION return _metrics_registry.create_metrics_operation(name, attributes) diff --git a/ydb/opentelemetry/tracing.py b/ydb/opentelemetry/tracing.py index 262a99c53..2fdd74b3c 100644 --- a/ydb/opentelemetry/tracing.py +++ b/ydb/opentelemetry/tracing.py @@ -1,9 +1,9 @@ """Internal SDK tracing helpers and telemetry 
facade.""" import enum -from typing import Optional, Tuple -from ydb.opentelemetry.metrics import create_metrics_operation, is_metrics_enabled +from ydb.opentelemetry._endpoint import split_endpoint +from ydb.opentelemetry.metrics import _build_ydb_metrics_attrs, create_metrics_operation, is_metrics_enabled class SpanName(str, enum.Enum): @@ -135,38 +135,14 @@ def get_trace_metadata(): return _registry.get_trace_metadata() -def _split_endpoint(endpoint: Optional[str]) -> Tuple[str, int]: - ep = endpoint or "" - if ep.startswith("grpcs://"): - ep = ep[len("grpcs://") :] - elif ep.startswith("grpc://"): - ep = ep[len("grpc://") :] - - if ep.startswith("["): - close = ep.find("]") - if close != -1 and len(ep) > close + 1 and ep[close + 1] == ":": - host = ep[: close + 1] - port_s = ep[close + 2 :] - return host, int(port_s) if port_s.isdigit() else 0 - - host, sep, port_s = ep.rpartition(":") - if not sep: - return ep, 0 - return host, int(port_s) if port_s.isdigit() else 0 - - -def _build_ydb_attrs(driver_config): - host, port = _split_endpoint(getattr(driver_config, "endpoint", None)) - return { +def _build_ydb_tracing_attrs(driver_config, node_id=None, peer=None): + host, port = split_endpoint(getattr(driver_config, "endpoint", None)) + attrs = { "db.system.name": "ydb", "db.namespace": getattr(driver_config, "database", None) or "", "server.address": host, "server.port": port, } - - -def _build_ydb_tracing_attrs(driver_config, node_id=None, peer=None): - attrs = _build_ydb_attrs(driver_config) if peer is not None: address, port_, location = peer if address is not None: @@ -191,7 +167,7 @@ def create_ydb_span(name, driver_config, node_id=None, kind=None, peer=None): Tracing receives full operation context, including peer/node details. Metrics receive only the stable labels defined for client operation metrics. 
""" - metrics_attrs = _build_ydb_attrs(driver_config) if is_metrics_enabled() else None + metrics_attrs = _build_ydb_metrics_attrs(driver_config) if is_metrics_enabled() else None tracing_attrs = _build_ydb_tracing_attrs(driver_config, node_id, peer) metrics = create_metrics_operation(name, metrics_attrs) return _TelemetryOperation(_registry.create_span(name, attributes=tracing_attrs, kind=kind), metrics) From 1284153a9a889bf6fc9424a4f8aaf3ee55742d92 Mon Sep 17 00:00:00 2001 From: tewbo Date: Fri, 22 May 2026 20:01:02 +0300 Subject: [PATCH 26/30] add pool min metric and add removing observable metrics when pool is closed --- docs/opentelemetry.rst | 5 ++ .../grafana/dashboards/README.md | 6 ++- .../dashboards/ydb-python-sdk-metrics.json | 5 ++ tests/opentelemetry/test_metrics.py | 47 ++++++++++++++++++- ydb/aio/query/pool.py | 2 + ydb/opentelemetry/metrics.py | 12 +++-- ydb/opentelemetry/metrics_plugin.py | 23 +++++++++ ydb/query/pool.py | 2 + 8 files changed, 93 insertions(+), 9 deletions(-) diff --git a/docs/opentelemetry.rst b/docs/opentelemetry.rst index 1fbdec8ed..b0d633146 100644 --- a/docs/opentelemetry.rst +++ b/docs/opentelemetry.rst @@ -267,6 +267,11 @@ The SDK creates the following instruments with meter name ``"ydb.sdk"``: - ObservableUpDownCounter - ``{connection}`` - Maximum configured number of sessions for a query session pool. + * - ``ydb.query.session.min`` + - ObservableUpDownCounter + - ``{connection}`` + - Minimum configured number of sessions for a query session pool. The SDK does not configure + a pool minimum, so this metric is always reported as ``0``. * - ``ydb.client.retry.duration`` - Histogram - ``s`` diff --git a/examples/opentelemetry/grafana/dashboards/README.md b/examples/opentelemetry/grafana/dashboards/README.md index eb47493ad..365fd38bf 100644 --- a/examples/opentelemetry/grafana/dashboards/README.md +++ b/examples/opentelemetry/grafana/dashboards/README.md @@ -1,5 +1,7 @@ -This folder is intentionally left empty. 
+This folder contains Grafana dashboards provisioned by the local OpenTelemetry example. -Grafana is provisioned with Tempo + Prometheus datasources; use **Explore** to search traces. +`ydb-python-sdk-metrics.json` shows the YDB Python SDK metrics exported to Prometheus, +including client operation latency, failures, query session pool usage, pool min/max, +pending session requests, acquisition timeouts, and retry metrics. diff --git a/examples/opentelemetry/grafana/dashboards/ydb-python-sdk-metrics.json b/examples/opentelemetry/grafana/dashboards/ydb-python-sdk-metrics.json index 798b51717..1a0706aa8 100644 --- a/examples/opentelemetry/grafana/dashboards/ydb-python-sdk-metrics.json +++ b/examples/opentelemetry/grafana/dashboards/ydb-python-sdk-metrics.json @@ -55,6 +55,11 @@ "expr": "sum by (ydb_query_session_pool_name) (ydb_query_session_max)", "legendFormat": "max - {{ydb_query_session_pool_name}}", "datasource": { "type": "prometheus", "uid": "prometheus" } + }, + { + "expr": "sum by (ydb_query_session_pool_name) (ydb_query_session_min)", + "legendFormat": "min - {{ydb_query_session_pool_name}}", + "datasource": { "type": "prometheus", "uid": "prometheus" } } ] }, diff --git a/tests/opentelemetry/test_metrics.py b/tests/opentelemetry/test_metrics.py index 18f3ab62a..72eef8eda 100644 --- a/tests/opentelemetry/test_metrics.py +++ b/tests/opentelemetry/test_metrics.py @@ -27,6 +27,13 @@ def _single_point_from_metrics(metrics, name): return points[0] +def _points(reader, name): + metrics = _metrics_by_name(reader) + if name not in metrics: + return [] + return list(metrics[name].data.data_points) + + def _histogram_sum(reader, name): return _single_point(reader, name).sum @@ -43,6 +50,7 @@ def test_metrics_registry_records_all_instruments(metrics_setup, monkeypatch): QUERY_SESSION_COUNT, QUERY_SESSION_CREATE_TIME, QUERY_SESSION_MAX, + QUERY_SESSION_MIN, QUERY_SESSION_PENDING_REQUESTS, QUERY_SESSION_TIMEOUTS, RETRY_ATTEMPTS, @@ -80,6 +88,7 @@ def 
test_metrics_registry_records_all_instruments(metrics_setup, monkeypatch): QUERY_SESSION_COUNT, QUERY_SESSION_CREATE_TIME, QUERY_SESSION_MAX, + QUERY_SESSION_MIN, QUERY_SESSION_PENDING_REQUESTS, QUERY_SESSION_TIMEOUTS, RETRY_ATTEMPTS, @@ -90,6 +99,7 @@ def test_metrics_registry_records_all_instruments(metrics_setup, monkeypatch): assert metrics[QUERY_SESSION_COUNT].unit == "{connection}" assert metrics[QUERY_SESSION_CREATE_TIME].unit == "s" assert metrics[QUERY_SESSION_MAX].unit == "{connection}" + assert metrics[QUERY_SESSION_MIN].unit == "{connection}" assert metrics[QUERY_SESSION_PENDING_REQUESTS].unit == "{request}" assert metrics[QUERY_SESSION_TIMEOUTS].unit == "{connection}" assert metrics[RETRY_DURATION].unit == "s" @@ -316,6 +326,7 @@ def test_query_session_helpers_record_pool_attributes(metrics_setup): from ydb.opentelemetry.metrics import ( QUERY_SESSION_CREATE_TIME, QUERY_SESSION_MAX, + QUERY_SESSION_MIN, QUERY_SESSION_PENDING_REQUESTS, QUERY_SESSION_TIMEOUTS, record_query_session_create_time, @@ -334,6 +345,7 @@ def test_query_session_helpers_record_pool_attributes(metrics_setup): pending_requests = _single_point_from_metrics(metrics, QUERY_SESSION_PENDING_REQUESTS) timeouts = _single_point_from_metrics(metrics, QUERY_SESSION_TIMEOUTS) session_max = _single_point_from_metrics(metrics, QUERY_SESSION_MAX) + session_min = _single_point_from_metrics(metrics, QUERY_SESSION_MIN) assert create_time.sum == 0.5 assert create_time.attributes == {"ydb.query.session.pool.name": "main"} @@ -343,16 +355,32 @@ def test_query_session_helpers_record_pool_attributes(metrics_setup): assert timeouts.attributes == {"ydb.query.session.pool.name": "main"} assert session_max.value == 100 assert session_max.attributes == {"ydb.query.session.pool.name": "main"} + assert session_min.value == 0 + assert session_min.attributes == {"ydb.query.session.pool.name": "main"} def test_sync_query_session_pool_records_max(metrics_setup): - from ydb.opentelemetry.metrics import 
QUERY_SESSION_MAX + from ydb.opentelemetry.metrics import QUERY_SESSION_MAX, QUERY_SESSION_MIN from ydb.query.pool import QuerySessionPool QuerySessionPool(driver=object(), size=42, name="sync-pool") assert _single_point(metrics_setup, QUERY_SESSION_MAX).value == 42 assert _single_point(metrics_setup, QUERY_SESSION_MAX).attributes == {"ydb.query.session.pool.name": "sync-pool"} + assert _single_point(metrics_setup, QUERY_SESSION_MIN).value == 0 + assert _single_point(metrics_setup, QUERY_SESSION_MIN).attributes == {"ydb.query.session.pool.name": "sync-pool"} + + +def test_sync_query_session_pool_stop_removes_observable_metrics(metrics_setup): + from ydb.opentelemetry.metrics import QUERY_SESSION_COUNT, QUERY_SESSION_MAX, QUERY_SESSION_MIN + from ydb.query.pool import QuerySessionPool + + pool = QuerySessionPool(driver=object(), size=42, name="sync-pool") + pool.stop() + + assert _points(metrics_setup, QUERY_SESSION_COUNT) == [] + assert _points(metrics_setup, QUERY_SESSION_MAX) == [] + assert _points(metrics_setup, QUERY_SESSION_MIN) == [] def test_sync_query_session_pool_uses_endpoint_as_default_pool_name(metrics_setup): @@ -374,12 +402,27 @@ class FakeDriver: @pytest.mark.asyncio async def test_async_query_session_pool_records_max(metrics_setup): from ydb.aio.query.pool import QuerySessionPool - from ydb.opentelemetry.metrics import QUERY_SESSION_MAX + from ydb.opentelemetry.metrics import QUERY_SESSION_MAX, QUERY_SESSION_MIN QuerySessionPool(driver=object(), size=24, name="async-pool") assert _single_point(metrics_setup, QUERY_SESSION_MAX).value == 24 assert _single_point(metrics_setup, QUERY_SESSION_MAX).attributes == {"ydb.query.session.pool.name": "async-pool"} + assert _single_point(metrics_setup, QUERY_SESSION_MIN).value == 0 + assert _single_point(metrics_setup, QUERY_SESSION_MIN).attributes == {"ydb.query.session.pool.name": "async-pool"} + + +@pytest.mark.asyncio +async def test_async_query_session_pool_stop_removes_observable_metrics(metrics_setup): + 
from ydb.aio.query.pool import QuerySessionPool + from ydb.opentelemetry.metrics import QUERY_SESSION_COUNT, QUERY_SESSION_MAX, QUERY_SESSION_MIN + + pool = QuerySessionPool(driver=object(), size=24, name="async-pool") + await pool.stop() + + assert _points(metrics_setup, QUERY_SESSION_COUNT) == [] + assert _points(metrics_setup, QUERY_SESSION_MAX) == [] + assert _points(metrics_setup, QUERY_SESSION_MIN) == [] @pytest.mark.asyncio diff --git a/ydb/aio/query/pool.py b/ydb/aio/query/pool.py index b2145c097..16c44b842 100644 --- a/ydb/aio/query/pool.py +++ b/ydb/aio/query/pool.py @@ -30,6 +30,7 @@ record_query_session_max, record_query_session_pending_requests, record_query_session_timeout, + remove_query_session_pool_metrics, ) from ..._grpc.grpcwrapper import common_utils from ..._grpc.grpcwrapper import ydb_query_public_types as _ydb_query_public @@ -296,6 +297,7 @@ async def stop(self): await asyncio.gather(*tasks) logger.debug("All session were deleted.") + remove_query_session_pool_metrics(self._metrics_pool_name) async def __aenter__(self): return self diff --git a/ydb/opentelemetry/metrics.py b/ydb/opentelemetry/metrics.py index d45aecd96..7afec52d2 100644 --- a/ydb/opentelemetry/metrics.py +++ b/ydb/opentelemetry/metrics.py @@ -20,6 +20,7 @@ QUERY_SESSION_PENDING_REQUESTS = "ydb.query.session.pending_requests" QUERY_SESSION_TIMEOUTS = "ydb.query.session.timeouts" QUERY_SESSION_MAX = "ydb.query.session.max" +QUERY_SESSION_MIN = "ydb.query.session.min" RETRY_ATTEMPTS = "ydb.client.retry.attempts" RETRY_DURATION = "ydb.client.retry.duration" @@ -101,14 +102,11 @@ def record(self, name: str, value: float, attributes: Optional[Dict[str, Any]] = def add_query_session_count(self, value: int, attributes: Optional[Dict[str, Any]] = None) -> None: pass - def get_query_session_count_values(self) -> Dict[Any, int]: - return {} - def set_query_session_max(self, value: int, attributes: Optional[Dict[str, Any]] = None) -> None: pass - def get_query_session_max_values(self) 
-> Dict[Any, int]: - return {} + def remove_query_session_pool(self, attributes: Optional[Dict[str, Any]] = None) -> None: + pass class _NoopMetricsOperation: @@ -306,6 +304,10 @@ def record_query_session_max(value: int, pool_name: Optional[str]) -> None: _metrics_registry.set_query_session_max(value, _pool_attrs(pool_name)) +def remove_query_session_pool_metrics(pool_name: Optional[str]) -> None: + _metrics_registry.remove_query_session_pool(_pool_attrs(pool_name)) + + def record_retry_metrics(duration: float, attempts: int) -> None: _metrics_registry.record(RETRY_DURATION, duration) _metrics_registry.record(RETRY_ATTEMPTS, attempts) diff --git a/ydb/opentelemetry/metrics_plugin.py b/ydb/opentelemetry/metrics_plugin.py index 408dd1e66..dde1fced9 100644 --- a/ydb/opentelemetry/metrics_plugin.py +++ b/ydb/opentelemetry/metrics_plugin.py @@ -21,6 +21,7 @@ QUERY_SESSION_COUNT, QUERY_SESSION_CREATE_TIME, QUERY_SESSION_MAX, + QUERY_SESSION_MIN, QUERY_SESSION_PENDING_REQUESTS, QUERY_SESSION_TIMEOUTS, RETRY_ATTEMPTS, @@ -88,6 +89,12 @@ def __init__(self, meter: Meter) -> None: unit="{connection}", description="Maximum configured number of YDB query sessions.", ), + QUERY_SESSION_MIN: meter.create_observable_up_down_counter( + QUERY_SESSION_MIN, + callbacks=[self._observe_query_session_min], + unit="{connection}", + description="Minimum configured number of YDB query sessions.", + ), RETRY_DURATION: meter.create_histogram( RETRY_DURATION, unit="s", @@ -144,12 +151,28 @@ def set_query_session_max(self, value: int, attributes: Optional[Dict[str, Any]] with self._observable_values_lock: self._query_session_max_values[attrs] = value + def remove_query_session_pool(self, attributes: Optional[Dict[str, Any]] = None) -> None: + base_attrs = list((attributes or {}).items()) + attrs = tuple(sorted(base_attrs)) + idle_attrs = tuple(sorted(base_attrs + [("ydb.query.session.state", "idle")])) + used_attrs = tuple(sorted(base_attrs + [("ydb.query.session.state", "used")])) + + with 
self._observable_values_lock: + self._query_session_count_values.pop(idle_attrs, None) + self._query_session_count_values.pop(used_attrs, None) + self._query_session_max_values.pop(attrs, None) + + def _observe_query_session_count(self, _: CallbackOptions) -> Iterable[Observation]: return self._observe(self._query_session_count_values) def _observe_query_session_max(self, _: CallbackOptions) -> Iterable[Observation]: return self._observe(self._query_session_max_values) + def _observe_query_session_min(self, _: CallbackOptions) -> Iterable[Observation]: + with self._observable_values_lock: + return [Observation(0, attributes=dict(attrs)) for attrs in self._query_session_max_values] + def _observe(self, values: Dict[Any, int]) -> Iterable[Observation]: with self._observable_values_lock: return [Observation(value, attributes=dict(attrs)) for attrs, value in values.items()] diff --git a/ydb/query/pool.py b/ydb/query/pool.py index 41198bcc4..c4b5a24d4 100644 --- a/ydb/query/pool.py +++ b/ydb/query/pool.py @@ -34,6 +34,7 @@ record_query_session_max, record_query_session_pending_requests, record_query_session_timeout, + remove_query_session_pool_metrics, ) from .._grpc.grpcwrapper import ydb_query_public_types as _ydb_query_public @@ -344,6 +345,7 @@ def stop(self, timeout=None): break logger.debug("All session were deleted.") + remove_query_session_pool_metrics(self._metrics_pool_name) finally: if acquired: self._lock.release() From fb42b9ec17d112f46a1ac85b015e328fb20c5007 Mon Sep 17 00:00:00 2001 From: tewbo Date: Fri, 22 May 2026 20:18:40 +0300 Subject: [PATCH 27/30] format --- examples/opentelemetry/load_tank.py | 3 +-- ydb/opentelemetry/metrics_plugin.py | 1 - 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/examples/opentelemetry/load_tank.py b/examples/opentelemetry/load_tank.py index 5d7b3f654..bd175cfc5 100644 --- a/examples/opentelemetry/load_tank.py +++ b/examples/opentelemetry/load_tank.py @@ -249,8 +249,7 @@ async def main() -> None: 
endpoint=config.endpoint, database=config.database, disable_discovery=True, - ) as raw_driver: - driver = cast(ydb.aio.Driver, raw_driver) + ) as driver: await driver.wait(timeout=60) async with ydb.aio.QuerySessionPool(driver, size=config.pool_size, name="load-tank") as pool: diff --git a/ydb/opentelemetry/metrics_plugin.py b/ydb/opentelemetry/metrics_plugin.py index dde1fced9..af6d52ddf 100644 --- a/ydb/opentelemetry/metrics_plugin.py +++ b/ydb/opentelemetry/metrics_plugin.py @@ -162,7 +162,6 @@ def remove_query_session_pool(self, attributes: Optional[Dict[str, Any]] = None) self._query_session_count_values.pop(used_attrs, None) self._query_session_max_values.pop(attrs, None) - def _observe_query_session_count(self, _: CallbackOptions) -> Iterable[Observation]: return self._observe(self._query_session_count_values) From be2347fd1af5e7265b64ef118b58f9ef4f523d41 Mon Sep 17 00:00:00 2001 From: tewbo Date: Fri, 22 May 2026 20:20:33 +0300 Subject: [PATCH 28/30] format --- examples/opentelemetry/load_tank.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/opentelemetry/load_tank.py b/examples/opentelemetry/load_tank.py index bd175cfc5..801ddadf5 100644 --- a/examples/opentelemetry/load_tank.py +++ b/examples/opentelemetry/load_tank.py @@ -7,7 +7,7 @@ import random import time from dataclasses import dataclass -from typing import AsyncIterator, Tuple, cast +from typing import AsyncIterator, Tuple from opentelemetry.exporter.otlp.proto.grpc.metric_exporter import OTLPMetricExporter from opentelemetry.sdk.metrics import MeterProvider From 68a515c4e6de81dec7b6465a042bd69f49b6c19d Mon Sep 17 00:00:00 2001 From: tewbo Date: Fri, 22 May 2026 20:39:53 +0300 Subject: [PATCH 29/30] supports old api --- tests/opentelemetry/test_metrics.py | 30 +++++++++++++++++++ ydb/opentelemetry/metrics_plugin.py | 45 ++++++++++++++++++++++++----- 2 files changed, 67 insertions(+), 8 deletions(-) diff --git a/tests/opentelemetry/test_metrics.py 
b/tests/opentelemetry/test_metrics.py index 72eef8eda..b6c87f3ee 100644 --- a/tests/opentelemetry/test_metrics.py +++ b/tests/opentelemetry/test_metrics.py @@ -110,6 +110,36 @@ def test_metrics_registry_records_all_instruments(metrics_setup, monkeypatch): assert _single_point_from_metrics(metrics, RETRY_ATTEMPTS).explicit_bounds == ATTEMPT_BUCKETS +def test_metrics_registry_supports_old_histogram_api(): + from ydb.opentelemetry.metrics_plugin import MetricsRegistry + + class FakeInstrument: + def add(self, value, attributes=None): + pass + + def record(self, value, attributes=None): + pass + + class FakeMeter: + def create_histogram(self, name, unit="", description="", **kwargs): + if "explicit_bucket_boundaries_advisory" in kwargs: + raise TypeError( + "create_histogram() got an unexpected keyword argument 'explicit_bucket_boundaries_advisory'" + ) + return FakeInstrument() + + def create_counter(self, name, unit="", description=""): + return FakeInstrument() + + def create_up_down_counter(self, name, unit="", description=""): + return FakeInstrument() + + def create_observable_up_down_counter(self, name, callbacks=None, unit="", description=""): + return FakeInstrument() + + MetricsRegistry(FakeMeter()) + + def test_metrics_registry_is_noop_without_meter(): from ydb.opentelemetry.metrics import ( create_metrics_operation, diff --git a/ydb/opentelemetry/metrics_plugin.py b/ydb/opentelemetry/metrics_plugin.py index af6d52ddf..9968cefb5 100644 --- a/ydb/opentelemetry/metrics_plugin.py +++ b/ydb/opentelemetry/metrics_plugin.py @@ -50,11 +50,12 @@ def __init__(self, meter: Meter) -> None: self._query_session_max_values: Dict[Any, int] = {} self._observable_values_lock = threading.Lock() self._instruments: Dict[str, _MetricInstrument] = { - CLIENT_OPERATION_DURATION: meter.create_histogram( + CLIENT_OPERATION_DURATION: _create_histogram( + meter, CLIENT_OPERATION_DURATION, unit="s", description="Duration of YDB client operations.", - 
explicit_bucket_boundaries_advisory=DURATION_BUCKETS_SECONDS, + bucket_boundaries=DURATION_BUCKETS_SECONDS, ), CLIENT_OPERATION_FAILED: meter.create_counter( CLIENT_OPERATION_FAILED, @@ -67,11 +68,12 @@ def __init__(self, meter: Meter) -> None: unit="{connection}", description="Number of open YDB query sessions.", ), - QUERY_SESSION_CREATE_TIME: meter.create_histogram( + QUERY_SESSION_CREATE_TIME: _create_histogram( + meter, QUERY_SESSION_CREATE_TIME, unit="s", description="Duration of YDB query session creation.", - explicit_bucket_boundaries_advisory=DURATION_BUCKETS_SECONDS, + bucket_boundaries=DURATION_BUCKETS_SECONDS, ), QUERY_SESSION_PENDING_REQUESTS: meter.create_up_down_counter( QUERY_SESSION_PENDING_REQUESTS, @@ -95,23 +97,25 @@ def __init__(self, meter: Meter) -> None: unit="{connection}", description="Minimum configured number of YDB query sessions.", ), - RETRY_DURATION: meter.create_histogram( + RETRY_DURATION: _create_histogram( + meter, RETRY_DURATION, unit="s", description=( "Total user-visible duration of a logical operation executed through the retry policy, " "including all attempts and back-off delays." ), - explicit_bucket_boundaries_advisory=RETRY_DURATION_BUCKETS_SECONDS, + bucket_boundaries=RETRY_DURATION_BUCKETS_SECONDS, ), - RETRY_ATTEMPTS: meter.create_histogram( + RETRY_ATTEMPTS: _create_histogram( + meter, RETRY_ATTEMPTS, unit="{attempt}", description=( "Total number of attempts performed by the retry policy for one logical operation. " "A value of 1 means the operation succeeded on the first try." 
), - explicit_bucket_boundaries_advisory=ATTEMPT_BUCKETS, + bucket_boundaries=ATTEMPT_BUCKETS, ), } @@ -200,3 +204,28 @@ def _disable_metrics() -> None: _reset_metrics_registry() _meter = None + + +def _create_histogram( + meter: Meter, + name: str, + unit: str, + description: str, + bucket_boundaries, +) -> Histogram: + """Create a histogram with bucket advice when the installed OpenTelemetry SDK supports it.""" + try: + return meter.create_histogram( + name, + unit=unit, + description=description, + explicit_bucket_boundaries_advisory=bucket_boundaries, + ) + except TypeError as e: + if "explicit_bucket_boundaries_advisory" not in str(e): + raise + return meter.create_histogram( + name, + unit=unit, + description=description, + ) From d0fc62404d8522bd46449cdc26b8525b195e36ec Mon Sep 17 00:00:00 2001 From: tewbo Date: Fri, 22 May 2026 20:51:23 +0300 Subject: [PATCH 30/30] fix test --- tests/opentelemetry/test_metrics.py | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/tests/opentelemetry/test_metrics.py b/tests/opentelemetry/test_metrics.py index b6c87f3ee..874f9ca60 100644 --- a/tests/opentelemetry/test_metrics.py +++ b/tests/opentelemetry/test_metrics.py @@ -1,6 +1,8 @@ +import inspect from unittest.mock import MagicMock import pytest +from opentelemetry.metrics import Meter def _metrics_by_name(reader): @@ -42,6 +44,10 @@ def _sum_value(reader, name): return _single_point(reader, name).value +def _histogram_boundaries_advisory_supported(): + return "explicit_bucket_boundaries_advisory" in inspect.signature(Meter.create_histogram).parameters + + def test_metrics_registry_records_all_instruments(metrics_setup, monkeypatch): from ydb import issues from ydb.opentelemetry.metrics import ( @@ -104,10 +110,15 @@ def test_metrics_registry_records_all_instruments(metrics_setup, monkeypatch): assert metrics[QUERY_SESSION_TIMEOUTS].unit == "{connection}" assert metrics[RETRY_DURATION].unit == "s" assert metrics[RETRY_ATTEMPTS].unit == 
"{attempt}" - assert _single_point_from_metrics(metrics, CLIENT_OPERATION_DURATION).explicit_bounds == DURATION_BUCKETS_SECONDS - assert _single_point_from_metrics(metrics, QUERY_SESSION_CREATE_TIME).explicit_bounds == DURATION_BUCKETS_SECONDS - assert _single_point_from_metrics(metrics, RETRY_DURATION).explicit_bounds == RETRY_DURATION_BUCKETS_SECONDS - assert _single_point_from_metrics(metrics, RETRY_ATTEMPTS).explicit_bounds == ATTEMPT_BUCKETS + if _histogram_boundaries_advisory_supported(): + assert ( + _single_point_from_metrics(metrics, CLIENT_OPERATION_DURATION).explicit_bounds == DURATION_BUCKETS_SECONDS + ) + assert ( + _single_point_from_metrics(metrics, QUERY_SESSION_CREATE_TIME).explicit_bounds == DURATION_BUCKETS_SECONDS + ) + assert _single_point_from_metrics(metrics, RETRY_DURATION).explicit_bounds == RETRY_DURATION_BUCKETS_SECONDS + assert _single_point_from_metrics(metrics, RETRY_ATTEMPTS).explicit_bounds == ATTEMPT_BUCKETS def test_metrics_registry_supports_old_histogram_api():