From b7cb1aa6f53921cb0484d95cb7385cb7308a0f13 Mon Sep 17 00:00:00 2001 From: Chris Hagglund Date: Wed, 29 Apr 2026 10:24:28 -0600 Subject: [PATCH 01/24] implement canonical metrics, feature flagged to preserve legacy where legacy ones were released --- harness/main.py | 23 +- harness/manifests/deployment.yaml | 4 + .../client/automator/async_task_runner.py | 23 +- .../client/automator/task_handler.py | 16 +- src/conductor/client/automator/task_runner.py | 23 +- .../settings/metrics_settings.py | 19 +- .../client/orkes/orkes_base_client.py | 5 +- .../client/orkes/orkes_workflow_client.py | 44 +- src/conductor/client/orkes_clients.py | 7 +- .../telemetry/canonical_metrics_collector.py | 215 ++++ .../telemetry/legacy_metrics_collector.py | 304 ++++++ .../client/telemetry/metrics_collector.py | 947 +----------------- .../telemetry/metrics_collector_base.py | 467 +++++++++ .../client/telemetry/metrics_factory.py | 45 + .../telemetry/model/metric_documentation.py | 9 + .../client/telemetry/model/metric_label.py | 3 +- .../client/telemetry/model/metric_name.py | 9 + .../workflow/executor/workflow_executor.py | 6 +- .../test_canonical_metrics_collector.py | 216 ++++ tests/unit/telemetry/test_metrics_factory.py | 117 +++ 20 files changed, 1554 insertions(+), 948 deletions(-) create mode 100644 src/conductor/client/telemetry/canonical_metrics_collector.py create mode 100644 src/conductor/client/telemetry/legacy_metrics_collector.py create mode 100644 src/conductor/client/telemetry/metrics_collector_base.py create mode 100644 src/conductor/client/telemetry/metrics_factory.py create mode 100644 tests/unit/telemetry/test_canonical_metrics_collector.py create mode 100644 tests/unit/telemetry/test_metrics_factory.py diff --git a/harness/main.py b/harness/main.py index 28a92b3df..2be7d8e6c 100644 --- a/harness/main.py +++ b/harness/main.py @@ -9,6 +9,7 @@ from conductor.client.configuration.settings.metrics_settings import MetricsSettings from conductor.client.http.models.task_def import TaskDef from conductor.client.orkes_clients import OrkesClients +from conductor.client.telemetry.metrics_factory import create_metrics_collector from conductor.client.workflow.conductor_workflow import ConductorWorkflow from conductor.client.workflow.task.simple_task import SimpleTask @@ -67,7 +68,16 @@ def register_metadata(clients: OrkesClients) -> None: def main() -> None: configuration = Configuration() - clients = OrkesClients(configuration) + + metrics_port = env_int_or_default("HARNESS_METRICS_PORT", 9991) + metrics_settings = MetricsSettings(http_port=metrics_port) + print(f"Prometheus metrics will be served on port {metrics_port}") + + # Main-process MetricsCollector: writes workflow-client / HTTP metrics into + # the shared PROMETHEUS_MULTIPROC_DIR so they merge with worker subprocess + # metrics in the exported /metrics payload. + metrics_collector = create_metrics_collector(metrics_settings) + clients = OrkesClients(configuration, metrics_collector=metrics_collector) register_metadata(clients) @@ -80,10 +90,6 @@ def main() -> None: worker = SimulatedTaskWorker(task_name, codename, sleep_seconds, batch_size, poll_interval_ms) workers.append(worker) - metrics_port = env_int_or_default("HARNESS_METRICS_PORT", 9991) - metrics_settings = MetricsSettings(http_port=metrics_port) - print(f"Prometheus metrics will be served on port {metrics_port}") - task_handler = TaskHandler( workers=workers, configuration=configuration, @@ -93,7 +99,6 @@ def main() -> None: workflow_executor = clients.get_workflow_executor() governor = WorkflowGovernor(workflow_executor, WORKFLOW_NAME, workflows_per_sec) - governor.start() main_pid = os.getpid() shutting_down = False @@ -111,7 +116,13 @@ def shutdown(signum, frame): signal.signal(signal.SIGINT, shutdown) signal.signal(signal.SIGTERM, shutdown) + # Fork worker processes BEFORE starting the governor thread. The parent's + # MetricsCollector uses prometheus_client's multiprocess mode, which guards + # its mmapped value files with a threading.Lock. If the governor thread is + # already running at fork() time, a child can inherit that lock in a held + # state and deadlock on its first metric write. task_handler.start_processes() + governor.start() task_handler.join_processes() diff --git a/harness/manifests/deployment.yaml b/harness/manifests/deployment.yaml index 048559faf..0043716b0 100644 --- a/harness/manifests/deployment.yaml +++ b/harness/manifests/deployment.yaml @@ -51,6 +51,10 @@ spec: - name: HARNESS_POLL_INTERVAL_MS value: "100" + # === METRICS MODE === + - name: WORKER_CANONICAL_METRICS + value: "true" + ports: - name: metrics containerPort: 9991 diff --git a/src/conductor/client/automator/async_task_runner.py b/src/conductor/client/automator/async_task_runner.py index ba3e36534..116c6ce36 100644 --- a/src/conductor/client/automator/async_task_runner.py +++ b/src/conductor/client/automator/async_task_runner.py @@ -27,7 +27,7 @@ from conductor.client.http.rest import AuthorizationException, ApiException from conductor.client.orkes.orkes_metadata_client import OrkesMetadataClient from conductor.client.orkes.orkes_schema_client import OrkesSchemaClient -from conductor.client.telemetry.metrics_collector import MetricsCollector +from conductor.client.telemetry.metrics_factory import create_metrics_collector from conductor.client.worker.worker_interface import WorkerInterface from conductor.client.worker.worker_config import resolve_worker_config, get_worker_config_oneline from conductor.client.worker.exception import NonRetryableException @@ -88,7 +88,7 @@ def __init__( self.metrics_collector = None if metrics_settings is not None: - self.metrics_collector = MetricsCollector( + self.metrics_collector = create_metrics_collector( metrics_settings ) # Register metrics collector as event listener @@ -863,6 +863,7 @@ async def __async_update_task(self, task_result: TaskResult): if attempt > 0: # Exponential backoff: [10s, 20s, 30s] before retry await asyncio.sleep(attempt * 10) + update_start = time.time() try: if self._use_update_v2: next_task = await self.async_task_client.update_task_v2(body=task_result) @@ -873,6 +874,10 @@ async def __async_update_task(self, task_result: TaskResult): task_definition_name, next_task.task_id if next_task else None ) + if self.metrics_collector is not None: + self.metrics_collector.record_task_update_time( + task_definition_name, time.time() - update_start, status="SUCCESS" + ) return next_task else: await self.async_task_client.update_task(body=task_result) @@ -882,6 +887,10 @@ async def __async_update_task(self, task_result: TaskResult): task_result.workflow_instance_id, task_definition_name, ) + if self.metrics_collector is not None: + self.metrics_collector.record_task_update_time( + task_definition_name, time.time() - update_start, status="SUCCESS" + ) return None except ApiException as e: if e.status in (404, 405) and self._use_update_v2: @@ -895,12 +904,19 @@ async def __async_update_task(self, task_result: TaskResult): # Retry immediately with v1 try: await self.async_task_client.update_task(body=task_result) + if self.metrics_collector is not None: + self.metrics_collector.record_task_update_time( + task_definition_name, time.time() - update_start, status="SUCCESS" + ) return None except Exception as fallback_e: last_exception = fallback_e continue last_exception = e if self.metrics_collector is not None: + self.metrics_collector.record_task_update_time( + task_definition_name, time.time() - update_start, status="FAILURE" + ) self.metrics_collector.increment_task_update_error( task_definition_name, type(e) ) @@ -917,6 +933,9 @@ async def __async_update_task(self, task_result: TaskResult): except Exception as e: last_exception = e if self.metrics_collector is not None: + self.metrics_collector.record_task_update_time( + task_definition_name, time.time() - update_start, status="FAILURE" + ) self.metrics_collector.increment_task_update_error( task_definition_name, type(e) ) diff --git a/src/conductor/client/automator/task_handler.py b/src/conductor/client/automator/task_handler.py index 2e3616f11..6a8255bd0 100644 --- a/src/conductor/client/automator/task_handler.py +++ b/src/conductor/client/automator/task_handler.py @@ -18,7 +18,7 @@ from conductor.client.event.task_runner_events import TaskRunnerEvent from conductor.client.event.sync_event_dispatcher import SyncEventDispatcher from conductor.client.event.sync_listener_register import register_task_runner_listener -from conductor.client.telemetry.metrics_collector import MetricsCollector +from conductor.client.telemetry.metrics_collector_base import MetricsCollectorBase from conductor.client.telemetry.model.metric_documentation import MetricDocumentation from conductor.client.telemetry.model.metric_label import MetricLabel from conductor.client.telemetry.model.metric_name import MetricName @@ -36,10 +36,14 @@ _mp_fork_set = False if not _mp_fork_set: try: - if platform == "win32": - set_start_method("spawn") - else: - set_start_method("fork") + # The prometheus_client library holds a module-level threading lock; + # forking while that lock is held causes a deadlock in child processes. + # Set CONDUCTOR_MP_START_METHOD=spawn to avoid this if you hit the + # deadlock. Default is fork for backward compatibility (spawn requires + # all Process arguments to be picklable). + _default_method = "spawn" if platform == "win32" else "fork" + _method = os.environ.get("CONDUCTOR_MP_START_METHOD", _default_method).strip().lower() + set_start_method(_method) _mp_fork_set = True except Exception as e: logger.info("error when setting multiprocessing.set_start_method - maybe the context is set %s", e.args) @@ -345,7 +349,7 @@ def __create_metrics_provider_process(self, metrics_settings: MetricsSettings) - self.metrics_provider_process = None return self.metrics_provider_process = Process( - target=MetricsCollector.provide_metrics, + target=MetricsCollectorBase.provide_metrics, args=(metrics_settings,) ) logger.info("Created MetricsProvider process") diff --git a/src/conductor/client/automator/task_runner.py b/src/conductor/client/automator/task_runner.py index af566de10..25df5961e 100644 --- a/src/conductor/client/automator/task_runner.py +++ b/src/conductor/client/automator/task_runner.py @@ -30,7 +30,7 @@ from conductor.client.http.rest import AuthorizationException, ApiException from conductor.client.orkes.orkes_metadata_client import OrkesMetadataClient from conductor.client.orkes.orkes_schema_client import OrkesSchemaClient -from conductor.client.telemetry.metrics_collector import MetricsCollector +from conductor.client.telemetry.metrics_factory import create_metrics_collector from conductor.client.worker.worker import ASYNC_TASK_RUNNING from conductor.client.worker.worker_interface import WorkerInterface from conductor.client.worker.worker_config import resolve_worker_config, get_worker_config_oneline @@ -69,7 +69,7 @@ def __init__( self.metrics_collector = None if metrics_settings is not None: - self.metrics_collector = MetricsCollector( + self.metrics_collector = create_metrics_collector( metrics_settings ) # Register metrics collector as event listener @@ -972,6 +972,7 @@ def __update_task(self, task_result: TaskResult): if attempt > 0: # Exponential backoff: [10s, 20s, 30s] before retry time.sleep(attempt * 10) + update_start = time.time() try: if self._use_update_v2: next_task = self.task_client.update_task_v2(body=task_result) @@ -982,6 +983,10 @@ def __update_task(self, task_result: TaskResult): task_definition_name, next_task.task_id if next_task else None ) + if self.metrics_collector is not None: + self.metrics_collector.record_task_update_time( + task_definition_name, time.time() - update_start, status="SUCCESS" + ) return next_task else: self.task_client.update_task(body=task_result) @@ -991,6 +996,10 @@ def __update_task(self, task_result: TaskResult): task_result.workflow_instance_id, task_definition_name, ) + if self.metrics_collector is not None: + self.metrics_collector.record_task_update_time( + task_definition_name, time.time() - update_start, status="SUCCESS" + ) return None except ApiException as e: if e.status in (404, 405) and self._use_update_v2: @@ -1004,12 +1013,19 @@ def __update_task(self, task_result: TaskResult): # Retry immediately with v1 try: self.task_client.update_task(body=task_result) + if self.metrics_collector is not None: + self.metrics_collector.record_task_update_time( + task_definition_name, time.time() - update_start, status="SUCCESS" + ) return None except Exception as fallback_e: last_exception = fallback_e continue last_exception = e if self.metrics_collector is not None: + self.metrics_collector.record_task_update_time( + task_definition_name, time.time() - update_start, status="FAILURE" + ) self.metrics_collector.increment_task_update_error( task_definition_name, type(e) ) @@ -1044,6 +1060,9 @@ def __update_task(self, task_result: TaskResult): except Exception as e: last_exception = e if self.metrics_collector is not None: + self.metrics_collector.record_task_update_time( + task_definition_name, time.time() - update_start, status="FAILURE" + ) self.metrics_collector.increment_task_update_error( task_definition_name, type(e) ) diff --git a/src/conductor/client/configuration/settings/metrics_settings.py b/src/conductor/client/configuration/settings/metrics_settings.py index 18a4c96bc..4d4bb6eec 100644 --- a/src/conductor/client/configuration/settings/metrics_settings.py +++ b/src/conductor/client/configuration/settings/metrics_settings.py @@ -24,7 +24,8 @@ def __init__( directory: Optional[str] = None, file_name: str = "metrics.log", update_interval: float = 0.1, - http_port: Optional[int] = None): + http_port: Optional[int] = None, + clean_directory: bool = True): """ Configure metrics collection settings. @@ -40,6 +41,10 @@ def __init__( If None: - Metrics will be written to file at {directory}/{file_name} - No HTTP server will be started + clean_directory: If True (default), remove stale prometheus_client + .db files from the directory on init. Without this, + metric names from previous configurations persist in the + multiprocess directory and appear in /metrics output. """ if directory is None: directory = get_default_temporary_folder() @@ -47,6 +52,8 @@ def __init__( self.file_name = file_name self.update_interval = update_interval self.http_port = http_port + if clean_directory: + self._clean_stale_db_files() def __set_dir(self, dir: str) -> None: if not os.path.isdir(dir): @@ -57,3 +64,13 @@ def __set_dir(self, dir: str) -> None: "Failed to create metrics temporary folder, reason: %s", e) self.directory = dir + + def _clean_stale_db_files(self) -> None: + """Remove leftover prometheus_client multiprocess .db files.""" + import glob + pattern = os.path.join(self.directory, "*.db") + for path in glob.glob(pattern): + try: + os.remove(path) + except Exception as e: + logger.debug("Could not remove stale metrics db file %s: %s", path, e) diff --git a/src/conductor/client/orkes/orkes_base_client.py b/src/conductor/client/orkes/orkes_base_client.py index 2e04533f8..3c5314ad8 100644 --- a/src/conductor/client/orkes/orkes_base_client.py +++ b/src/conductor/client/orkes/orkes_base_client.py @@ -22,8 +22,9 @@ class OrkesBaseClient(object): - def __init__(self, configuration: Configuration): - self.api_client = ApiClient(configuration) + def __init__(self, configuration: Configuration, metrics_collector=None): + self.metrics_collector = metrics_collector + self.api_client = ApiClient(configuration, metrics_collector=metrics_collector) self.logger = logging.getLogger( Configuration.get_logging_formatted_name(__name__) ) diff --git a/src/conductor/client/orkes/orkes_workflow_client.py b/src/conductor/client/orkes/orkes_workflow_client.py index 24472260e..7ef501c07 100644 --- a/src/conductor/client/orkes/orkes_workflow_client.py +++ b/src/conductor/client/orkes/orkes_workflow_client.py @@ -1,4 +1,5 @@ from __future__ import annotations +import json from typing import Optional, List, Dict from conductor.client.configuration.configuration import Configuration @@ -18,9 +19,10 @@ class OrkesWorkflowClient(OrkesBaseClient, WorkflowClient): def __init__( self, - configuration: Configuration + configuration: Configuration, + metrics_collector=None, ): - super(OrkesWorkflowClient, self).__init__(configuration) + super(OrkesWorkflowClient, self).__init__(configuration, metrics_collector=metrics_collector) def start_workflow_by_name( self, @@ -38,10 +40,44 @@ def start_workflow_by_name( if priority: kwargs.update({"priority": priority}) - return self.workflowResourceApi.start_workflow1(input, name, **kwargs) + self._record_workflow_input_payload_size(name, version, input) + try: + return self.workflowResourceApi.start_workflow1(input, name, **kwargs) + except Exception as e: + self._record_workflow_start_error(name, e) + raise def start_workflow(self, start_workflow_request: StartWorkflowRequest) -> str: - return self.workflowResourceApi.start_workflow(start_workflow_request) + self._record_workflow_input_payload_size( + start_workflow_request.name, + start_workflow_request.version, + getattr(start_workflow_request, "input", None), + ) + try: + return self.workflowResourceApi.start_workflow(start_workflow_request) + except Exception as e: + self._record_workflow_start_error(start_workflow_request.name, e) + raise + + def _record_workflow_input_payload_size(self, name: str, version, workflow_input) -> None: + if self.metrics_collector is None: + return + try: + encoded = json.dumps(workflow_input if workflow_input is not None else {}, default=str) + size_bytes = len(encoded.encode("utf-8")) + self.metrics_collector.record_workflow_input_payload_size( + name, str(version) if version is not None else "", size_bytes, + ) + except Exception: + pass + + def _record_workflow_start_error(self, name: str, exception: Exception) -> None: + if self.metrics_collector is None: + return + try: + self.metrics_collector.increment_workflow_start_error(name, exception) + except Exception: + pass def execute_workflow( self, diff --git a/src/conductor/client/orkes_clients.py b/src/conductor/client/orkes_clients.py index 51078b195..50c773907 100644 --- a/src/conductor/client/orkes_clients.py +++ b/src/conductor/client/orkes_clients.py @@ -21,13 +21,14 @@ class OrkesClients: - def __init__(self, configuration: Configuration = None): + def __init__(self, configuration: Configuration = None, metrics_collector=None): if configuration is None: configuration = Configuration() self.configuration = configuration + self.metrics_collector = metrics_collector def get_workflow_client(self) -> WorkflowClient: - return OrkesWorkflowClient(self.configuration) + return OrkesWorkflowClient(self.configuration, metrics_collector=self.metrics_collector) def get_authorization_client(self) -> AuthorizationClient: return OrkesAuthorizationClient(self.configuration) @@ -48,7 +49,7 @@ def get_integration_client(self) -> IntegrationClient: return OrkesIntegrationClient(self.configuration) def get_workflow_executor(self) -> WorkflowExecutor: - return WorkflowExecutor(self.configuration) + return WorkflowExecutor(self.configuration, metrics_collector=self.metrics_collector) def get_prompt_client(self) -> PromptClient: return OrkesPromptClient(self.configuration) diff --git a/src/conductor/client/telemetry/canonical_metrics_collector.py b/src/conductor/client/telemetry/canonical_metrics_collector.py new file mode 100644 index 000000000..f77a8890e --- /dev/null +++ b/src/conductor/client/telemetry/canonical_metrics_collector.py @@ -0,0 +1,215 @@ +""" +Canonical Prometheus metrics implementation following the harmonized metric +catalog (see sdk-metrics-harmonization.md). + +Uses real Prometheus Histograms for timing and size metrics, bounded-cardinality +exception labels, and canonical metric names (_total suffixed counters, _seconds +suffixed time histograms, _bytes suffixed size histograms). + +Events / metrics that have no canonical equivalent (legacy-only gauges) are +consumed as no-ops. This class is selected at runtime when +WORKER_CANONICAL_METRICS=true. +""" + +from conductor.client.configuration.settings.metrics_settings import MetricsSettings +from conductor.client.telemetry.metrics_collector_base import ( + MetricsCollectorBase, + _exception_label, +) +from conductor.client.telemetry.model.metric_documentation import MetricDocumentation +from conductor.client.telemetry.model.metric_label import MetricLabel +from conductor.client.telemetry.model.metric_name import MetricName + +TIME_BUCKETS = (0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0) +SIZE_BUCKETS = (100, 1_000, 10_000, 100_000, 1_000_000, 10_000_000) + + +class CanonicalMetricsCollector(MetricsCollectorBase): + + def __init__(self, settings: MetricsSettings): + super().__init__(settings) + + # ------------------------------------------------------------------ + # Counters + # ------------------------------------------------------------------ + + def increment_task_poll(self, task_type: str) -> None: + self._increment_counter( + name=MetricName.TASK_POLL, + documentation=MetricDocumentation.TASK_POLL, + labels={MetricLabel.TASK_TYPE: task_type}, + ) + + def increment_task_poll_error(self, task_type: str, exception) -> None: + self._increment_counter( + name=MetricName.TASK_POLL_ERROR, + documentation=MetricDocumentation.TASK_POLL_ERROR, + labels={ + MetricLabel.TASK_TYPE: task_type, + MetricLabel.EXCEPTION: _exception_label(exception), + }, + ) + + def increment_task_execution_started(self, task_type: str) -> None: + self._increment_counter( + name=MetricName.TASK_EXECUTION_STARTED, + documentation=MetricDocumentation.TASK_EXECUTION_STARTED, + labels={MetricLabel.TASK_TYPE: task_type}, + ) + + def increment_task_execution_queue_full(self, task_type: str) -> None: + self._increment_counter( + name=MetricName.TASK_EXECUTION_QUEUE_FULL, + documentation=MetricDocumentation.TASK_EXECUTION_QUEUE_FULL, + labels={MetricLabel.TASK_TYPE: task_type}, + ) + + def increment_uncaught_exception(self, exception=None) -> None: + self._increment_counter( + name=MetricName.THREAD_UNCAUGHT_EXCEPTION, + documentation=MetricDocumentation.THREAD_UNCAUGHT_EXCEPTION, + labels={ + MetricLabel.EXCEPTION: _exception_label(exception), + }, + ) + + def increment_worker_restart(self, task_type: str) -> None: + self._increment_counter( + name=MetricName.WORKER_RESTART, + documentation=MetricDocumentation.WORKER_RESTART, + labels={MetricLabel.TASK_TYPE: task_type}, + ) + + def increment_task_paused(self, task_type: str) -> None: + self._increment_counter( + name=MetricName.TASK_PAUSED, + documentation=MetricDocumentation.TASK_PAUSED, + labels={MetricLabel.TASK_TYPE: task_type}, + ) + + def increment_task_execution_error(self, task_type: str, exception) -> None: + self._increment_counter( + name=MetricName.TASK_EXECUTE_ERROR, + documentation=MetricDocumentation.TASK_EXECUTE_ERROR, + labels={ + MetricLabel.TASK_TYPE: task_type, + MetricLabel.EXCEPTION: _exception_label(exception), + }, + ) + + def increment_task_ack_failed(self, task_type: str) -> None: + self._increment_counter( + name=MetricName.TASK_ACK_FAILED, + documentation=MetricDocumentation.TASK_ACK_FAILED, + labels={MetricLabel.TASK_TYPE: task_type}, + ) + + def increment_task_ack_error(self, task_type: str, exception) -> None: + self._increment_counter( + name=MetricName.TASK_ACK_ERROR, + documentation=MetricDocumentation.TASK_ACK_ERROR, + labels={ + MetricLabel.TASK_TYPE: task_type, + MetricLabel.EXCEPTION: _exception_label(exception), + }, + ) + + def increment_task_update_error(self, task_type: str, exception) -> None: + self._increment_counter( + name=MetricName.TASK_UPDATE_ERROR, + documentation=MetricDocumentation.TASK_UPDATE_ERROR, + labels={ + MetricLabel.TASK_TYPE: task_type, + MetricLabel.EXCEPTION: _exception_label(exception), + }, + ) + + def increment_external_payload_used(self, entity_name: str, operation: str, payload_type: str) -> None: + self._increment_counter( + name=MetricName.EXTERNAL_PAYLOAD_USED, + documentation=MetricDocumentation.EXTERNAL_PAYLOAD_USED, + labels={ + MetricLabel.ENTITY_NAME: entity_name, + MetricLabel.OPERATION: operation, + MetricLabel.PAYLOAD_TYPE: payload_type, + }, + ) + + def increment_workflow_start_error(self, workflow_type: str, exception) -> None: + self._increment_counter( + name=MetricName.WORKFLOW_START_ERROR, + documentation=MetricDocumentation.WORKFLOW_START_ERROR, + labels={ + MetricLabel.WORKFLOW_TYPE: workflow_type, + MetricLabel.EXCEPTION: _exception_label(exception), + }, + ) + + # ------------------------------------------------------------------ + # Timing (real Prometheus Histograms) + # ------------------------------------------------------------------ + + def record_task_poll_time(self, task_type: str, time_spent: float, status: str = "SUCCESS") -> None: + self._observe_histogram( + name=MetricName.TASK_POLL_TIME_HISTOGRAM, + documentation=MetricDocumentation.TASK_POLL_TIME_HISTOGRAM, + labels={MetricLabel.TASK_TYPE: task_type, MetricLabel.STATUS: status}, + value=time_spent, + buckets=TIME_BUCKETS, + ) + + def record_task_execute_time(self, task_type: str, time_spent: float, status: str = "SUCCESS") -> None: + self._observe_histogram( + name=MetricName.TASK_EXECUTE_TIME_HISTOGRAM, + documentation=MetricDocumentation.TASK_EXECUTE_TIME_HISTOGRAM, + labels={MetricLabel.TASK_TYPE: task_type, MetricLabel.STATUS: status}, + value=time_spent, + buckets=TIME_BUCKETS, + ) + + def record_task_update_time(self, task_type: str, time_spent: float, status: str = "SUCCESS") -> None: + self._observe_histogram( + name=MetricName.TASK_UPDATE_TIME_HISTOGRAM, + documentation=MetricDocumentation.TASK_UPDATE_TIME_HISTOGRAM, + labels={MetricLabel.TASK_TYPE: task_type, MetricLabel.STATUS: status}, + value=time_spent, + buckets=TIME_BUCKETS, + ) + + def record_api_request_time(self, method: str, uri: str, status: str, time_spent: float) -> None: + self._observe_histogram( + name=MetricName.API_REQUEST_TIME_CANONICAL, + documentation=MetricDocumentation.API_REQUEST_TIME_CANONICAL, + labels={ + MetricLabel.METHOD: method, + MetricLabel.URI: uri, + MetricLabel.STATUS: status, + }, + value=time_spent, + buckets=TIME_BUCKETS, + ) + + # ------------------------------------------------------------------ + # Sizes (real Prometheus Histograms with size buckets) + # ------------------------------------------------------------------ + + def record_task_result_payload_size(self, task_type: str, payload_size: int) -> None: + self._observe_histogram( + name=MetricName.TASK_RESULT_SIZE_BYTES, + documentation=MetricDocumentation.TASK_RESULT_SIZE_BYTES, + labels={MetricLabel.TASK_TYPE: task_type}, + value=payload_size, + buckets=SIZE_BUCKETS, + ) + + def record_workflow_input_payload_size(self, workflow_type: str, version: str, payload_size: int) -> None: + self._observe_histogram( + name=MetricName.WORKFLOW_INPUT_SIZE_BYTES, + documentation=MetricDocumentation.WORKFLOW_INPUT_SIZE_BYTES, + labels={ + MetricLabel.WORKFLOW_TYPE: workflow_type, + MetricLabel.WORKFLOW_VERSION: version, + }, + value=payload_size, + buckets=SIZE_BUCKETS, + ) diff --git a/src/conductor/client/telemetry/legacy_metrics_collector.py b/src/conductor/client/telemetry/legacy_metrics_collector.py new file mode 100644 index 000000000..e819810b9 --- /dev/null +++ b/src/conductor/client/telemetry/legacy_metrics_collector.py @@ -0,0 +1,304 @@ +""" +Legacy Prometheus metrics implementation preserving the metric names, label +conventions, and quantile-gauge timing shape from the original python-sdk. + +Events / metrics that have no legacy equivalent are consumed as no-ops. +This class is selected at runtime when WORKER_CANONICAL_METRICS is not true. +""" + +from collections import deque +from typing import Any, Dict, List + +from conductor.client.configuration.settings.metrics_settings import MetricsSettings +from conductor.client.telemetry import metrics_collector_base as _mcb +from conductor.client.telemetry.metrics_collector_base import MetricsCollectorBase +from conductor.client.telemetry.model.metric_documentation import MetricDocumentation +from conductor.client.telemetry.model.metric_label import MetricLabel +from conductor.client.telemetry.model.metric_name import MetricName + + +class LegacyMetricsCollector(MetricsCollectorBase): + + QUANTILE_WINDOW_SIZE = 1000 + + def __init__(self, settings: MetricsSettings): + super().__init__(settings) + self.quantile_metrics: Dict[str, Any] = {} + self.quantile_data: Dict[str, deque] = {} + + # ------------------------------------------------------------------ + # Counters + # ------------------------------------------------------------------ + + def increment_task_poll(self, task_type: str) -> None: + self._increment_counter( + name=MetricName.TASK_POLL, + documentation=MetricDocumentation.TASK_POLL, + labels={MetricLabel.TASK_TYPE: task_type}, + ) + + def increment_task_poll_error(self, task_type: str, exception) -> None: + pass # no legacy metric for poll errors + + def increment_task_execution_started(self, task_type: str) -> None: + pass # no legacy metric for execution-started + + def increment_task_execution_queue_full(self, task_type: str) -> None: + self._increment_counter( + name=MetricName.TASK_EXECUTION_QUEUE_FULL, + documentation=MetricDocumentation.TASK_EXECUTION_QUEUE_FULL, + labels={MetricLabel.TASK_TYPE: task_type}, + ) + + def increment_uncaught_exception(self, exception=None) -> None: + self._increment_counter( + name=MetricName.THREAD_UNCAUGHT_EXCEPTION, + documentation=MetricDocumentation.THREAD_UNCAUGHT_EXCEPTION, + labels={}, + ) + + def increment_worker_restart(self, task_type: str) -> None: + self._increment_counter( + name=MetricName.WORKER_RESTART, + documentation=MetricDocumentation.WORKER_RESTART, + labels={MetricLabel.TASK_TYPE: task_type}, + ) + + def increment_task_paused(self, task_type: str) -> None: + self._increment_counter( + name=MetricName.TASK_PAUSED, + documentation=MetricDocumentation.TASK_PAUSED, + labels={MetricLabel.TASK_TYPE: task_type}, + ) + + def increment_task_execution_error(self, task_type: str, exception) -> None: + self._increment_counter( + name=MetricName.TASK_EXECUTE_ERROR, + documentation=MetricDocumentation.TASK_EXECUTE_ERROR, + labels={ + MetricLabel.TASK_TYPE: task_type, + MetricLabel.EXCEPTION: str(exception), + }, + ) + + def increment_task_ack_failed(self, task_type: str) -> None: + self._increment_counter( + name=MetricName.TASK_ACK_FAILED, + documentation=MetricDocumentation.TASK_ACK_FAILED, + labels={MetricLabel.TASK_TYPE: task_type}, + ) + + def increment_task_ack_error(self, task_type: str, exception) -> None: + self._increment_counter( + name=MetricName.TASK_ACK_ERROR, + documentation=MetricDocumentation.TASK_ACK_ERROR, + labels={ + MetricLabel.TASK_TYPE: task_type, + MetricLabel.EXCEPTION: str(exception), + }, + ) + + def increment_task_update_error(self, task_type: str, exception) -> None: + self._increment_counter( + name=MetricName.TASK_UPDATE_ERROR, + documentation=MetricDocumentation.TASK_UPDATE_ERROR, + labels={ + MetricLabel.TASK_TYPE: task_type, + MetricLabel.EXCEPTION: str(exception), + }, + ) + + def increment_external_payload_used(self, entity_name: str, operation: str, payload_type: str) -> None: + self._increment_counter( + name=MetricName.EXTERNAL_PAYLOAD_USED, + documentation=MetricDocumentation.EXTERNAL_PAYLOAD_USED, + labels={ + MetricLabel.ENTITY_NAME: entity_name, + MetricLabel.OPERATION: operation, + MetricLabel.PAYLOAD_TYPE_LEGACY: payload_type, + }, + ) + + def increment_workflow_start_error(self, workflow_type: str, exception) -> None: + self._increment_counter( + name=MetricName.WORKFLOW_START_ERROR, + documentation=MetricDocumentation.WORKFLOW_START_ERROR, + labels={ + MetricLabel.WORKFLOW_TYPE: workflow_type, + MetricLabel.EXCEPTION: str(exception), + }, + ) + + # ------------------------------------------------------------------ + # Timing (last-value gauges + quantile gauges) + # ------------------------------------------------------------------ + + def record_task_poll_time(self, task_type: str, time_spent: float, status: str = "SUCCESS") -> None: + self._record_gauge( + name=MetricName.TASK_POLL_TIME, + documentation=MetricDocumentation.TASK_POLL_TIME, + labels={MetricLabel.TASK_TYPE: task_type}, + value=time_spent, + ) + self._record_quantiles( + name=MetricName.TASK_POLL_TIME_HISTOGRAM, + documentation=MetricDocumentation.TASK_POLL_TIME_HISTOGRAM, + labels={MetricLabel.TASK_TYPE: task_type, MetricLabel.STATUS: status}, + value=time_spent, + ) + + def record_task_execute_time(self, task_type: str, time_spent: float, status: str = "SUCCESS") -> None: + self._record_gauge( + name=MetricName.TASK_EXECUTE_TIME, + documentation=MetricDocumentation.TASK_EXECUTE_TIME, + labels={MetricLabel.TASK_TYPE: task_type}, + value=time_spent, + ) + self._record_quantiles( + name=MetricName.TASK_EXECUTE_TIME_HISTOGRAM, + documentation=MetricDocumentation.TASK_EXECUTE_TIME_HISTOGRAM, + labels={MetricLabel.TASK_TYPE: task_type, MetricLabel.STATUS: status}, + value=time_spent, + ) + + def record_task_update_time(self, task_type: str, time_spent: float, status: str = "SUCCESS") -> None: + self._record_quantiles( + name=MetricName.TASK_UPDATE_TIME_HISTOGRAM, + documentation=MetricDocumentation.TASK_UPDATE_TIME_HISTOGRAM, + labels={MetricLabel.TASK_TYPE: task_type, MetricLabel.STATUS: status}, + value=time_spent, + ) + + def record_api_request_time(self, method: str, uri: str, status: str, time_spent: float) -> None: + self._record_quantiles( + name=MetricName.API_REQUEST_TIME, + documentation=MetricDocumentation.API_REQUEST_TIME, + labels={ + MetricLabel.METHOD: method, + MetricLabel.URI: uri, + MetricLabel.STATUS: status, + }, + value=time_spent, + ) + + # ------------------------------------------------------------------ + # Sizes (last-value gauges) + # ------------------------------------------------------------------ + + def record_task_result_payload_size(self, task_type: str, payload_size: int) -> None: + self._record_gauge( + name=MetricName.TASK_RESULT_SIZE, + documentation=MetricDocumentation.TASK_RESULT_SIZE, + labels={MetricLabel.TASK_TYPE: task_type}, + value=payload_size, + ) + + def record_workflow_input_payload_size(self, workflow_type: str, version: str, payload_size: int) -> None: + self._record_gauge( + name=MetricName.WORKFLOW_INPUT_SIZE, + documentation=MetricDocumentation.WORKFLOW_INPUT_SIZE, + labels={ + MetricLabel.WORKFLOW_TYPE: workflow_type, + MetricLabel.WORKFLOW_VERSION: version, + }, + value=payload_size, + ) + + # ------------------------------------------------------------------ + # Quantile-gauge machinery (legacy timing shape) + # ------------------------------------------------------------------ + + def _record_quantiles( + self, + name: MetricName, + documentation: MetricDocumentation, + labels: Dict[MetricLabel, str], + value: float + ) -> None: + if not self.must_collect_metrics: + return + + with self._lock: + label_values = tuple(labels.values()) + data_key = f"{name}_{label_values}" + + if data_key not in self.quantile_data: + self.quantile_data[data_key] = deque(maxlen=self.QUANTILE_WINDOW_SIZE) + + self.quantile_data[data_key].append(value) + + observations = sorted(self.quantile_data[data_key]) + n = len(observations) + + if n > 0: + quantiles = [0.5, 0.75, 0.9, 0.95, 0.99] + for q in quantiles: + quantile_value = self._calculate_quantile(observations, q) + gauge = self._get_quantile_gauge( + name=name, + documentation=documentation, + labelnames=[label.value for label in labels.keys()] + ["quantile"], + ) + gauge.labels(*labels.values(), str(q)).set(quantile_value) + + self._update_summary_aggregates( + name=name, + documentation=documentation, + labels=labels, + observations=list(self.quantile_data[data_key]), + ) + + @staticmethod + def _calculate_quantile(sorted_values: List[float], quantile: float) -> float: + if not sorted_values: + return 0.0 + n = len(sorted_values) + index = quantile * (n - 1) + if index.is_integer(): + return sorted_values[int(index)] + lower_index = int(index) + upper_index = min(lower_index + 1, n - 1) + fraction = index - lower_index + return sorted_values[lower_index] + fraction * (sorted_values[upper_index] - sorted_values[lower_index]) + + def _get_quantile_gauge(self, name, documentation, labelnames): + if name not in self.quantile_metrics: + _mcb._ensure_prometheus_imported() + self.quantile_metrics[name] = _mcb.Gauge( + name=name, + documentation=documentation, + labelnames=labelnames, + registry=self.registry, + multiprocess_mode='all', + ) + return self.quantile_metrics[name] + + def _update_summary_aggregates(self, name, documentation, labels, observations): + if not observations: + return + _mcb._ensure_prometheus_imported() + base_name = name.value if hasattr(name, 'value') else str(name) + doc_str = documentation.value if hasattr(documentation, 'value') else str(documentation) + + count_name = f"{base_name}_count" + if count_name not in self.gauges: + self.gauges[count_name] = _mcb.Gauge( + name=count_name, + documentation=f"{doc_str} - count", + labelnames=[label.value for label in labels.keys()], + registry=self.registry, + multiprocess_mode='all', + ) + + sum_name = f"{base_name}_sum" + if sum_name not in self.gauges: + self.gauges[sum_name] = _mcb.Gauge( + name=sum_name, + documentation=f"{doc_str} - sum", + labelnames=[label.value for label in labels.keys()], + registry=self.registry, + multiprocess_mode='all', + ) + + self.gauges[count_name].labels(*labels.values()).set(len(observations)) + self.gauges[sum_name].labels(*labels.values()).set(sum(observations)) diff --git a/src/conductor/client/telemetry/metrics_collector.py b/src/conductor/client/telemetry/metrics_collector.py index 7e3b6c579..0773326d6 100644 --- a/src/conductor/client/telemetry/metrics_collector.py +++ b/src/conductor/client/telemetry/metrics_collector.py @@ -1,927 +1,38 @@ -import logging -import os -import threading -import time -from collections import deque -from typing import Any, ClassVar, Dict, List, Tuple +""" +Backward-compatibility shim. -# Lazy imports - these will be imported when first needed -# This is necessary for multiprocess mode where PROMETHEUS_MULTIPROC_DIR -# must be set before prometheus_client is imported -CollectorRegistry = None -Counter = None -Gauge = None -Histogram = None -Summary = None -write_to_textfile = None -MultiProcessCollector = None +Existing code that does:: -def _ensure_prometheus_imported(): - """Lazy import of prometheus_client to ensure PROMETHEUS_MULTIPROC_DIR is set first.""" - global CollectorRegistry, Counter, Gauge, Histogram, Summary, write_to_textfile, MultiProcessCollector + from conductor.client.telemetry.metrics_collector import MetricsCollector - if CollectorRegistry is None: - from prometheus_client import CollectorRegistry as _CollectorRegistry - from prometheus_client import Counter as _Counter - from prometheus_client import Gauge as _Gauge - from prometheus_client import Histogram as _Histogram - from prometheus_client import Summary as _Summary - from prometheus_client import write_to_textfile as _write_to_textfile - from prometheus_client.multiprocess import MultiProcessCollector as _MultiProcessCollector +will continue to work -- ``MetricsCollector`` is the legacy implementation. - CollectorRegistry = _CollectorRegistry - Counter = _Counter - Gauge = _Gauge - Histogram = _Histogram - Summary = _Summary - write_to_textfile = _write_to_textfile - MultiProcessCollector = _MultiProcessCollector +``task_handler.py`` accesses ``mc._ensure_prometheus_imported()``, +``mc.Counter``, ``mc.CollectorRegistry`` etc. via +``from conductor.client.telemetry import metrics_collector as mc``. +These are lazily-initialised module-level globals in the base module, +so we use ``__getattr__`` to forward attribute lookups dynamically +(a plain ``from … import Counter`` would capture ``None`` at import time). +""" -from conductor.client.configuration.configuration import Configuration -from conductor.client.configuration.settings.metrics_settings import MetricsSettings -from conductor.client.telemetry.model.metric_documentation import MetricDocumentation -from conductor.client.telemetry.model.metric_label import MetricLabel -from conductor.client.telemetry.model.metric_name import MetricName +# Re-export the legacy implementation under its original name. +from conductor.client.telemetry.legacy_metrics_collector import LegacyMetricsCollector as MetricsCollector # noqa: F401 -# Event system imports (for new event-driven architecture) -from conductor.client.event.task_runner_events import ( - PollStarted, - PollCompleted, - PollFailure, - TaskExecutionStarted, - TaskExecutionCompleted, - TaskExecutionFailure, -) -from conductor.client.event.workflow_events import ( - WorkflowStarted, - WorkflowInputPayloadSize, - WorkflowPayloadUsed, -) -from conductor.client.event.task_events import ( - TaskResultPayloadSize, - TaskPayloadUsed, -) +from conductor.client.telemetry import metrics_collector_base as _base # noqa: F401 -logger = logging.getLogger( - Configuration.get_logging_formatted_name( - __name__ - ) -) +_FORWARDED = { + "_ensure_prometheus_imported", + "CollectorRegistry", + "Counter", + "Gauge", + "Histogram", + "Summary", + "write_to_textfile", + "MultiProcessCollector", +} -class MetricsCollector: - """ - Prometheus-based metrics collector for Conductor operations. - - This class implements the event listener protocols (TaskRunnerEventsListener, - WorkflowEventsListener, TaskEventsListener) via structural subtyping (duck typing), - matching the Java SDK's MetricsCollector interface. - - Supports both usage patterns: - 1. Direct method calls (backward compatible): - metrics.increment_task_poll(task_type) - - 2. Event-driven (new): - dispatcher.register(PollStarted, metrics.on_poll_started) - dispatcher.publish(PollStarted(...)) - - Thread Safety: - This class is thread-safe. Internal dictionaries (counters, gauges, histograms, etc.) - are protected by a lock to prevent race conditions when accessed from multiple threads - (e.g., worker threads and monitor threads). - - Note: Uses Python's Protocol for structural subtyping rather than explicit - inheritance to avoid circular imports and maintain backward compatibility. - """ - QUANTILE_WINDOW_SIZE = 1000 # Keep last 1000 observations for quantile calculation - - def __init__(self, settings: MetricsSettings): - # Instance state (avoid cross-test/dir interference from class-level caches). - self.counters: Dict[str, Counter] = {} - self.gauges: Dict[str, Gauge] = {} - self.histograms: Dict[str, Histogram] = {} - self.summaries: Dict[str, Summary] = {} - self.quantile_metrics: Dict[str, Gauge] = {} # metric_name -> Gauge with quantile label (used as summary) - self.quantile_data: Dict[str, deque] = {} # metric_name+labels -> deque of values - self.registry = None - self.must_collect_metrics = False - self._lock = threading.RLock() # Reentrant lock for thread-safe access to internal dictionaries - - if settings is None: - return - - os.environ["PROMETHEUS_MULTIPROC_DIR"] = settings.directory - - # Import prometheus_client NOW (after PROMETHEUS_MULTIPROC_DIR is set). - _ensure_prometheus_imported() - - # Each MetricsCollector instance gets its own registry so callers/tests can - # safely use different PROMETHEUS_MULTIPROC_DIR values in the same process. - self.registry = CollectorRegistry() - - self.must_collect_metrics = True - logger.debug( - "MetricsCollector initialized with directory=%s, must_collect=%s", - settings.directory, - self.must_collect_metrics, - ) - - @staticmethod - def provide_metrics(settings: MetricsSettings) -> None: - if settings is None: - return - - # Set environment variable for this process - os.environ["PROMETHEUS_MULTIPROC_DIR"] = settings.directory - - # Import prometheus_client in this process too (after setting env var) - _ensure_prometheus_imported() - - OUTPUT_FILE_PATH = os.path.join( - settings.directory, - settings.file_name - ) - - # Wait a bit for worker processes to start and create initial metrics - time.sleep(0.5) - - registry = CollectorRegistry() - # Use custom collector that removes pid label and aggregates across processes - from prometheus_client.multiprocess import MultiProcessCollector as MPCollector - from prometheus_client.samples import Sample - from prometheus_client.metrics_core import Metric - - class NoPidCollector(MPCollector): - """Custom collector that removes pid label and aggregates metrics across processes.""" - def collect(self): - for metric in super().collect(): - # Group samples by label set (excluding pid) - aggregated = {} - - for sample in metric.samples: - # Remove pid from labels - labels = {k: v for k, v in sample.labels.items() if k != 'pid'} - # Create key from sample name and labels - label_items = tuple(sorted(labels.items())) - key = (sample.name, label_items) - - if key not in aggregated: - aggregated[key] = { - 'labels': labels, - 'values': [], - 'name': sample.name, - 'timestamp': sample.timestamp, - 'exemplar': sample.exemplar - } - - aggregated[key]['values'].append(sample.value) - - # Create consolidated samples - filtered_samples = [] - for key, data in aggregated.items(): - # For counters and _count/_sum metrics: sum the values - # For gauges with quantiles: take the mean (approximation) - # For other gauges: take the last value - if metric.type == 'counter' or data['name'].endswith('_count') or data['name'].endswith('_sum'): - # Sum values for counters - value = sum(data['values']) - elif 'quantile' in data['labels']: - # For quantile metrics, take the mean across processes - value = sum(data['values']) / len(data['values']) - else: - # For other gauges, take the last value - value = data['values'][-1] - - filtered_samples.append( - Sample(data['name'], data['labels'], value, data['timestamp'], data['exemplar']) - ) - - # Create new metric and assign filtered samples - new_metric = Metric(metric.name, metric.documentation, metric.type) - new_metric.samples = filtered_samples - yield new_metric - - NoPidCollector(registry) - - # Start HTTP server if port is specified - http_server = None - if settings.http_port is not None: - http_server = MetricsCollector._start_http_server(settings.http_port, registry) - logger.info(f"Metrics HTTP server mode: serving from memory (no file writes) (pid={os.getpid()})") - - # When HTTP server is enabled, don't write to file - just keep updating registry in memory - # The HTTP server reads directly from the registry - while True: - time.sleep(settings.update_interval) - else: - # File-based mode: write metrics to file periodically - logger.info(f"Metrics file mode: writing to {OUTPUT_FILE_PATH} (pid={os.getpid()})") - while True: - try: - write_to_textfile( - OUTPUT_FILE_PATH, - registry - ) - except Exception as e: - # Log error but continue - metrics files might be in inconsistent state - logger.debug(f"Error writing metrics (will retry): {e}") - - time.sleep(settings.update_interval) - - @staticmethod - def _start_http_server(port: int, registry: 'CollectorRegistry') -> 'HTTPServer': - """Start HTTP server to expose metrics endpoint for Prometheus scraping.""" - from http.server import HTTPServer, BaseHTTPRequestHandler - import threading - - class MetricsHTTPHandler(BaseHTTPRequestHandler): - """HTTP handler to serve Prometheus metrics.""" - - def do_GET(self): - """Handle GET requests for /metrics endpoint.""" - if self.path == '/metrics': - try: - # Generate metrics in Prometheus text format - from prometheus_client import generate_latest - metrics_content = generate_latest(registry) - - # Send response - self.send_response(200) - self.send_header('Content-Type', 'text/plain; version=0.0.4; charset=utf-8') - self.end_headers() - self.wfile.write(metrics_content) - - except Exception as e: - logger.error(f"Error serving metrics: {e}") - self.send_response(500) - self.send_header('Content-Type', 'text/plain') - self.end_headers() - self.wfile.write(f'Error: {str(e)}'.encode('utf-8')) - - elif self.path == '/' or self.path == '/health': - # Health check endpoint - self.send_response(200) - self.send_header('Content-Type', 'text/plain') - self.end_headers() - self.wfile.write(b'OK') - - else: - self.send_response(404) - self.send_header('Content-Type', 'text/plain') - self.end_headers() - self.wfile.write(b'Not Found - Try /metrics') - - def log_message(self, format, *args): - """Override to use our logger instead of stderr.""" - logger.debug(f"HTTP {self.address_string()} - {format % args}") - - server = HTTPServer(('', port), MetricsHTTPHandler) - logger.info(f"Started metrics HTTP server on port {port} (pid={os.getpid()})") - logger.info(f"Metrics available at: http://localhost:{port}/metrics") - - # Run server in daemon thread - server_thread = threading.Thread(target=server.serve_forever, daemon=True) - server_thread.start() - - return server - - def increment_task_poll(self, task_type: str) -> None: - self.__increment_counter( - name=MetricName.TASK_POLL, - documentation=MetricDocumentation.TASK_POLL, - labels={ - MetricLabel.TASK_TYPE: task_type - } - ) - - def increment_task_execution_queue_full(self, task_type: str) -> None: - self.__increment_counter( - name=MetricName.TASK_EXECUTION_QUEUE_FULL, - documentation=MetricDocumentation.TASK_EXECUTION_QUEUE_FULL, - labels={ - MetricLabel.TASK_TYPE: task_type - } - ) - - def increment_uncaught_exception(self): - self.__increment_counter( - name=MetricName.THREAD_UNCAUGHT_EXCEPTION, - documentation=MetricDocumentation.THREAD_UNCAUGHT_EXCEPTION, - labels={} - ) - - def increment_worker_restart(self, task_type: str) -> None: - """Incremented each time TaskHandler restarts a worker subprocess.""" - self.__increment_counter( - name=MetricName.WORKER_RESTART, - documentation=MetricDocumentation.WORKER_RESTART, - labels={ - MetricLabel.TASK_TYPE: task_type - } - ) - - def increment_task_poll_error(self, task_type: str, exception: Exception) -> None: - # No-op: Poll errors are already tracked via task_poll_time_seconds_count with status=FAILURE - pass - - def increment_task_paused(self, task_type: str) -> None: - self.__increment_counter( - name=MetricName.TASK_PAUSED, - documentation=MetricDocumentation.TASK_PAUSED, - labels={ - MetricLabel.TASK_TYPE: task_type - } - ) - - def increment_task_execution_error(self, task_type: str, exception: Exception) -> None: - self.__increment_counter( - name=MetricName.TASK_EXECUTE_ERROR, - documentation=MetricDocumentation.TASK_EXECUTE_ERROR, - labels={ - MetricLabel.TASK_TYPE: task_type, - MetricLabel.EXCEPTION: str(exception) - } - ) - - def increment_task_ack_failed(self, task_type: str) -> None: - self.__increment_counter( - name=MetricName.TASK_ACK_FAILED, - documentation=MetricDocumentation.TASK_ACK_FAILED, - labels={ - MetricLabel.TASK_TYPE: task_type - } - ) - - def increment_task_ack_error(self, task_type: str, exception: Exception) -> None: - self.__increment_counter( - name=MetricName.TASK_ACK_ERROR, - documentation=MetricDocumentation.TASK_ACK_ERROR, - labels={ - MetricLabel.TASK_TYPE: task_type, - MetricLabel.EXCEPTION: str(exception) - } - ) - - def increment_task_update_error(self, task_type: str, exception: Exception) -> None: - self.__increment_counter( - name=MetricName.TASK_UPDATE_ERROR, - documentation=MetricDocumentation.TASK_UPDATE_ERROR, - labels={ - MetricLabel.TASK_TYPE: task_type, - MetricLabel.EXCEPTION: str(exception) - } - ) - - def increment_external_payload_used(self, entity_name: str, operation: str, payload_type: str) -> None: - self.__increment_counter( - name=MetricName.EXTERNAL_PAYLOAD_USED, - documentation=MetricDocumentation.EXTERNAL_PAYLOAD_USED, - labels={ - MetricLabel.ENTITY_NAME: entity_name, - MetricLabel.OPERATION: operation, - MetricLabel.PAYLOAD_TYPE: payload_type - } - ) - - def increment_workflow_start_error(self, workflow_type: str, exception: Exception) -> None: - self.__increment_counter( - name=MetricName.WORKFLOW_START_ERROR, - documentation=MetricDocumentation.WORKFLOW_START_ERROR, - labels={ - MetricLabel.WORKFLOW_TYPE: workflow_type, - MetricLabel.EXCEPTION: str(exception) - } - ) - - def record_workflow_input_payload_size(self, workflow_type: str, version: str, payload_size: int) -> None: - self.__record_gauge( - name=MetricName.WORKFLOW_INPUT_SIZE, - documentation=MetricDocumentation.WORKFLOW_INPUT_SIZE, - labels={ - MetricLabel.WORKFLOW_TYPE: workflow_type, - MetricLabel.WORKFLOW_VERSION: version - }, - value=payload_size - ) - - def record_task_result_payload_size(self, task_type: str, payload_size: int) -> None: - self.__record_gauge( - name=MetricName.TASK_RESULT_SIZE, - documentation=MetricDocumentation.TASK_RESULT_SIZE, - labels={ - MetricLabel.TASK_TYPE: task_type - }, - value=payload_size - ) - - def record_task_poll_time(self, task_type: str, time_spent: float, status: str = "SUCCESS") -> None: - self.__record_gauge( - name=MetricName.TASK_POLL_TIME, - documentation=MetricDocumentation.TASK_POLL_TIME, - labels={ - MetricLabel.TASK_TYPE: task_type - }, - value=time_spent - ) - # Record as quantile gauges for percentile tracking - self.__record_quantiles( - name=MetricName.TASK_POLL_TIME_HISTOGRAM, - documentation=MetricDocumentation.TASK_POLL_TIME_HISTOGRAM, - labels={ - MetricLabel.TASK_TYPE: task_type, - MetricLabel.STATUS: status - }, - value=time_spent - ) - - def record_task_execute_time(self, task_type: str, time_spent: float, status: str = "SUCCESS") -> None: - self.__record_gauge( - name=MetricName.TASK_EXECUTE_TIME, - documentation=MetricDocumentation.TASK_EXECUTE_TIME, - labels={ - MetricLabel.TASK_TYPE: task_type - }, - value=time_spent - ) - # Record as quantile gauges for percentile tracking - self.__record_quantiles( - name=MetricName.TASK_EXECUTE_TIME_HISTOGRAM, - documentation=MetricDocumentation.TASK_EXECUTE_TIME_HISTOGRAM, - labels={ - MetricLabel.TASK_TYPE: task_type, - MetricLabel.STATUS: status - }, - value=time_spent - ) - - def record_task_poll_time_histogram(self, task_type: str, time_spent: float, status: str = "SUCCESS") -> None: - """Record task poll time with quantile gauges for percentile tracking.""" - self.__record_quantiles( - name=MetricName.TASK_POLL_TIME_HISTOGRAM, - documentation=MetricDocumentation.TASK_POLL_TIME_HISTOGRAM, - labels={ - MetricLabel.TASK_TYPE: task_type, - MetricLabel.STATUS: status - }, - value=time_spent - ) - - def record_task_execute_time_histogram(self, task_type: str, time_spent: float, status: str = "SUCCESS") -> None: - """Record task execution time with quantile gauges for percentile tracking.""" - self.__record_quantiles( - name=MetricName.TASK_EXECUTE_TIME_HISTOGRAM, - documentation=MetricDocumentation.TASK_EXECUTE_TIME_HISTOGRAM, - labels={ - MetricLabel.TASK_TYPE: task_type, - MetricLabel.STATUS: status - }, - value=time_spent - ) - - def record_task_update_time_histogram(self, task_type: str, time_spent: float, status: str = "SUCCESS") -> None: - """Record task update time with quantile gauges for percentile tracking.""" - self.__record_quantiles( - name=MetricName.TASK_UPDATE_TIME_HISTOGRAM, - documentation=MetricDocumentation.TASK_UPDATE_TIME_HISTOGRAM, - labels={ - MetricLabel.TASK_TYPE: task_type, - MetricLabel.STATUS: status - }, - value=time_spent - ) - - def record_api_request_time(self, method: str, uri: str, status: str, time_spent: float) -> None: - """Record API request time with quantile gauges for percentile tracking.""" - self.__record_quantiles( - name=MetricName.API_REQUEST_TIME, - documentation=MetricDocumentation.API_REQUEST_TIME, - labels={ - MetricLabel.METHOD: method, - MetricLabel.URI: uri, - MetricLabel.STATUS: status - }, - value=time_spent - ) - - def __increment_counter( - self, - name: MetricName, - documentation: MetricDocumentation, - labels: Dict[MetricLabel, str] - ) -> None: - if not self.must_collect_metrics: - return - with self._lock: - counter = self.__get_counter( - name=name, - documentation=documentation, - labelnames=[label.value for label in labels.keys()] - ) - counter.labels(*labels.values()).inc() - - def __record_gauge( - self, - name: MetricName, - documentation: MetricDocumentation, - labels: Dict[MetricLabel, str], - value: Any - ) -> None: - if not self.must_collect_metrics: - return - with self._lock: - gauge = self.__get_gauge( - name=name, - documentation=documentation, - labelnames=[label.value for label in labels.keys()] - ) - gauge.labels(*labels.values()).set(value) - - def __get_counter( - self, - name: MetricName, - documentation: MetricDocumentation, - labelnames: List[MetricLabel] - ) -> Counter: - if name not in self.counters: - self.counters[name] = self.__generate_counter( - name, documentation, labelnames - ) - return self.counters[name] - - def __get_gauge( - self, - name: MetricName, - documentation: MetricDocumentation, - labelnames: List[MetricLabel] - ) -> Gauge: - if name not in self.gauges: - self.gauges[name] = self.__generate_gauge( - name, documentation, labelnames - ) - return self.gauges[name] - - def __generate_counter( - self, - name: MetricName, - documentation: MetricDocumentation, - labelnames: List[MetricLabel] - ) -> Counter: - return Counter( - name=name, - documentation=documentation, - labelnames=labelnames, - registry=self.registry - ) - - def __generate_gauge( - self, - name: MetricName, - documentation: MetricDocumentation, - labelnames: List[MetricLabel] - ) -> Gauge: - return Gauge( - name=name, - documentation=documentation, - labelnames=labelnames, - registry=self.registry, - multiprocess_mode='all' # Aggregate across processes, don't include pid - ) - - def __observe_histogram( - self, - name: MetricName, - documentation: MetricDocumentation, - labels: Dict[MetricLabel, str], - value: Any - ) -> None: - if not self.must_collect_metrics: - return - with self._lock: - histogram = self.__get_histogram( - name=name, - documentation=documentation, - labelnames=[label.value for label in labels.keys()] - ) - histogram.labels(*labels.values()).observe(value) - - def __get_histogram( - self, - name: MetricName, - documentation: MetricDocumentation, - labelnames: List[MetricLabel] - ) -> Histogram: - if name not in self.histograms: - self.histograms[name] = self.__generate_histogram( - name, documentation, labelnames - ) - return self.histograms[name] - - def __generate_histogram( - self, - name: MetricName, - documentation: MetricDocumentation, - labelnames: List[MetricLabel] - ) -> Histogram: - # Standard buckets for timing metrics: 1ms to 10s - return Histogram( - name=name, - documentation=documentation, - labelnames=labelnames, - buckets=(0.001, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0), - registry=self.registry - ) - - def __observe_summary( - self, - name: MetricName, - documentation: MetricDocumentation, - labels: Dict[MetricLabel, str], - value: Any - ) -> None: - if not self.must_collect_metrics: - return - with self._lock: - summary = self.__get_summary( - name=name, - documentation=documentation, - labelnames=[label.value for label in labels.keys()] - ) - summary.labels(*labels.values()).observe(value) - - def __get_summary( - self, - name: MetricName, - documentation: MetricDocumentation, - labelnames: List[MetricLabel] - ) -> Summary: - if name not in self.summaries: - self.summaries[name] = self.__generate_summary( - name, documentation, labelnames - ) - return self.summaries[name] - - def __generate_summary( - self, - name: MetricName, - documentation: MetricDocumentation, - labelnames: List[MetricLabel] - ) -> Summary: - # Create summary metric - # Note: Prometheus Summary metrics provide count and sum by default - # For percentiles, use histogram buckets or calculate server-side - return Summary( - name=name, - documentation=documentation, - labelnames=labelnames, - registry=self.registry - ) - - def __record_quantiles( - self, - name: MetricName, - documentation: MetricDocumentation, - labels: Dict[MetricLabel, str], - value: float - ) -> None: - """ - Record a value and update quantile gauges (p50, p75, p90, p95, p99). - Also maintains _count and _sum for proper summary metrics. - - Maintains a sliding window of observations and calculates quantiles. - """ - if not self.must_collect_metrics: - return - - with self._lock: - # Create a key for this metric+labels combination - label_values = tuple(labels.values()) - data_key = f"{name}_{label_values}" - - # Initialize data window if needed - if data_key not in self.quantile_data: - self.quantile_data[data_key] = deque(maxlen=self.QUANTILE_WINDOW_SIZE) - - # Add new observation - self.quantile_data[data_key].append(value) - - # Calculate and update quantiles - observations = sorted(self.quantile_data[data_key]) - n = len(observations) - - if n > 0: - quantiles = [0.5, 0.75, 0.9, 0.95, 0.99] - for q in quantiles: - quantile_value = self.__calculate_quantile(observations, q) - - # Get or create gauge for this quantile - gauge = self.__get_quantile_gauge( - name=name, - documentation=documentation, - labelnames=[label.value for label in labels.keys()] + ["quantile"], - quantile=q - ) - - # Set gauge value with labels + quantile - gauge.labels(*labels.values(), str(q)).set(quantile_value) - - # Also publish _count and _sum for proper summary metrics - self.__update_summary_aggregates( - name=name, - documentation=documentation, - labels=labels, - observations=list(self.quantile_data[data_key]) - ) - - def __calculate_quantile(self, sorted_values: List[float], quantile: float) -> float: - """Calculate quantile from sorted list of values.""" - if not sorted_values: - return 0.0 - - n = len(sorted_values) - index = quantile * (n - 1) - - if index.is_integer(): - return sorted_values[int(index)] - else: - # Linear interpolation - lower_index = int(index) - upper_index = min(lower_index + 1, n - 1) - fraction = index - lower_index - return sorted_values[lower_index] + fraction * (sorted_values[upper_index] - sorted_values[lower_index]) - - def __get_quantile_gauge( - self, - name: MetricName, - documentation: MetricDocumentation, - labelnames: List[str], - quantile: float - ) -> Gauge: - """Get or create a gauge for quantiles (single gauge with quantile label).""" - if name not in self.quantile_metrics: - # Create a single gauge with quantile as a label - # This gauge will be shared across all quantiles for this metric - # Note: In multiprocess mode, prometheus_client automatically adds 'pid' label - # We use multiprocess_mode='all' to aggregate across processes and remove pid - self.quantile_metrics[name] = Gauge( - name=name, - documentation=documentation, - labelnames=labelnames, - registry=self.registry, - multiprocess_mode='all' # Aggregate across processes, don't include pid - ) - - return self.quantile_metrics[name] - - def __update_summary_aggregates( - self, - name: MetricName, - documentation: MetricDocumentation, - labels: Dict[MetricLabel, str], - observations: List[float] - ) -> None: - """ - Update _count and _sum gauges for proper summary metric format. - This makes the metrics compatible with Prometheus summary type. - - Note: This method should only be called while holding self._lock - (called from __record_quantiles which already holds the lock). - """ - if not observations: - return - - # Convert enum to string value - base_name = name.value if hasattr(name, 'value') else str(name) - - # Convert documentation enum to string - doc_str = documentation.value if hasattr(documentation, 'value') else str(documentation) - - # Get or create _count gauge - count_name = f"{base_name}_count" - if count_name not in self.gauges: - self.gauges[count_name] = Gauge( - name=count_name, - documentation=f"{doc_str} - count", - labelnames=[label.value for label in labels.keys()], - registry=self.registry, - multiprocess_mode='all' # Aggregate across processes, don't include pid - ) - - # Get or create _sum gauge - sum_name = f"{base_name}_sum" - if sum_name not in self.gauges: - self.gauges[sum_name] = Gauge( - name=sum_name, - documentation=f"{doc_str} - sum", - labelnames=[label.value for label in labels.keys()], - registry=self.registry, - multiprocess_mode='all' # Aggregate across processes, don't include pid - ) - - # Update values - self.gauges[count_name].labels(*labels.values()).set(len(observations)) - self.gauges[sum_name].labels(*labels.values()).set(sum(observations)) - - # ========================================================================= - # Event Listener Protocol Implementation (TaskRunnerEventsListener) - # ========================================================================= - # These methods allow MetricsCollector to be used as an event listener - # in the new event-driven architecture, while maintaining backward - # compatibility with existing direct method calls. - - def on_poll_started(self, event: PollStarted) -> None: - """ - Handle poll started event. - Maps to increment_task_poll() for backward compatibility. - """ - self.increment_task_poll(event.task_type) - - def on_poll_completed(self, event: PollCompleted) -> None: - """ - Handle poll completed event. - Maps to record_task_poll_time() for backward compatibility. - """ - self.record_task_poll_time(event.task_type, event.duration_ms / 1000, status="SUCCESS") - - def on_poll_failure(self, event: PollFailure) -> None: - """ - Handle poll failure event. - Maps to increment_task_poll_error() for backward compatibility. - Also records poll time with FAILURE status. - """ - self.increment_task_poll_error(event.task_type, event.cause) - # Record poll time with failure status if duration is available - if hasattr(event, 'duration_ms') and event.duration_ms is not None: - self.record_task_poll_time(event.task_type, event.duration_ms / 1000, status="FAILURE") - - def on_task_execution_started(self, event: TaskExecutionStarted) -> None: - """ - Handle task execution started event. - No direct metric equivalent in old system - could be used for - tracking in-flight tasks in the future. - """ - pass # No corresponding metric in existing system - - def on_task_execution_completed(self, event: TaskExecutionCompleted) -> None: - """ - Handle task execution completed event. - Maps to record_task_execute_time() and record_task_result_payload_size(). - """ - self.record_task_execute_time(event.task_type, event.duration_ms / 1000, status="SUCCESS") - if event.output_size_bytes is not None: - self.record_task_result_payload_size(event.task_type, event.output_size_bytes) - - def on_task_execution_failure(self, event: TaskExecutionFailure) -> None: - """ - Handle task execution failure event. - Maps to increment_task_execution_error() for backward compatibility. - Also records execution time with FAILURE status. - """ - self.increment_task_execution_error(event.task_type, event.cause) - # Record execution time with failure status if duration is available - if hasattr(event, 'duration_ms') and event.duration_ms is not None: - self.record_task_execute_time(event.task_type, event.duration_ms / 1000, status="FAILURE") - - # ========================================================================= - # Event Listener Protocol Implementation (WorkflowEventsListener) - # ========================================================================= - - def on_workflow_started(self, event: WorkflowStarted) -> None: - """ - Handle workflow started event. - Maps to increment_workflow_start_error() if workflow failed to start. - """ - if not event.success and event.cause is not None: - self.increment_workflow_start_error(event.name, event.cause) - - def on_workflow_input_payload_size(self, event: WorkflowInputPayloadSize) -> None: - """ - Handle workflow input payload size event. - Maps to record_workflow_input_payload_size(). - """ - version_str = str(event.version) if event.version is not None else "1" - self.record_workflow_input_payload_size(event.name, version_str, event.size_bytes) - - def on_workflow_payload_used(self, event: WorkflowPayloadUsed) -> None: - """ - Handle workflow external payload usage event. - Maps to increment_external_payload_used(). - """ - self.increment_external_payload_used(event.name, event.operation, event.payload_type) - - # ========================================================================= - # Event Listener Protocol Implementation (TaskEventsListener) - # ========================================================================= - - def on_task_result_payload_size(self, event: TaskResultPayloadSize) -> None: - """ - Handle task result payload size event. - Maps to record_task_result_payload_size(). - """ - self.record_task_result_payload_size(event.task_type, event.size_bytes) - - def on_task_payload_used(self, event: TaskPayloadUsed) -> None: - """ - Handle task external payload usage event. - Maps to increment_external_payload_used(). - """ - self.increment_external_payload_used(event.task_type, event.operation, event.payload_type) +def __getattr__(name): + if name in _FORWARDED: + return getattr(_base, name) + raise AttributeError(f"module {__name__!r} has no attribute {name!r}") diff --git a/src/conductor/client/telemetry/metrics_collector_base.py b/src/conductor/client/telemetry/metrics_collector_base.py new file mode 100644 index 000000000..edb72d6a5 --- /dev/null +++ b/src/conductor/client/telemetry/metrics_collector_base.py @@ -0,0 +1,467 @@ +import abc +import logging +import os +import signal +import threading +import time +from typing import Any, Dict, List + +# Lazy imports - these will be imported when first needed. +# PROMETHEUS_MULTIPROC_DIR must be set before prometheus_client is imported. +CollectorRegistry = None +Counter = None +Gauge = None +Histogram = None +Summary = None +write_to_textfile = None +MultiProcessCollector = None + + +def _ensure_prometheus_imported(): + """Lazy import of prometheus_client to ensure PROMETHEUS_MULTIPROC_DIR is set first.""" + global CollectorRegistry, Counter, Gauge, Histogram, Summary, write_to_textfile, MultiProcessCollector + + if CollectorRegistry is None: + from prometheus_client import CollectorRegistry as _CollectorRegistry + from prometheus_client import Counter as _Counter + from prometheus_client import Gauge as _Gauge + from prometheus_client import Histogram as _Histogram + from prometheus_client import Summary as _Summary + from prometheus_client import write_to_textfile as _write_to_textfile + from prometheus_client.multiprocess import MultiProcessCollector as _MultiProcessCollector + + CollectorRegistry = _CollectorRegistry + Counter = _Counter + Gauge = _Gauge + Histogram = _Histogram + Summary = _Summary + write_to_textfile = _write_to_textfile + MultiProcessCollector = _MultiProcessCollector + + +def _exception_label(exception) -> str: + """ + Return a bounded-cardinality label value for an exception. + + Reduces to the bare class name (mirroring the Java / Go SDK convention) + to prevent unbounded label cardinality from error messages. + """ + if exception is None: + return "None" + if isinstance(exception, type): + return exception.__name__ + if isinstance(exception, BaseException): + return type(exception).__name__ + return str(exception) + + +from conductor.client.configuration.configuration import Configuration +from conductor.client.configuration.settings.metrics_settings import MetricsSettings +from conductor.client.telemetry.model.metric_documentation import MetricDocumentation +from conductor.client.telemetry.model.metric_label import MetricLabel +from conductor.client.telemetry.model.metric_name import MetricName + +from conductor.client.event.task_runner_events import ( + PollStarted, + PollCompleted, + PollFailure, + TaskExecutionStarted, + TaskExecutionCompleted, + TaskExecutionFailure, +) +from conductor.client.event.workflow_events import ( + WorkflowStarted, + WorkflowInputPayloadSize, + WorkflowPayloadUsed, +) +from conductor.client.event.task_events import ( + TaskResultPayloadSize, + TaskPayloadUsed, +) + +logger = logging.getLogger( + Configuration.get_logging_formatted_name( + __name__ + ) +) + + +class MetricsCollectorBase(abc.ABC): + """ + Abstract base class for Conductor metrics collectors. + + Provides shared Prometheus infrastructure (registry, lazy imports, multiprocess + aggregation, HTTP server) and concrete event-handler implementations that + delegate to abstract metric methods. Subclasses implement the abstract methods + to emit either legacy or canonical metric shapes. + + Satisfies the MetricsCollector Protocol in event/listeners.py via duck typing. + """ + + def __init__(self, settings: MetricsSettings): + self.counters: Dict[str, Any] = {} + self.gauges: Dict[str, Any] = {} + self.histograms: Dict[str, Any] = {} + self.summaries: Dict[str, Any] = {} + self.registry = None + self.must_collect_metrics = False + self._lock = threading.RLock() + + if settings is None: + return + + os.environ["PROMETHEUS_MULTIPROC_DIR"] = settings.directory + + _ensure_prometheus_imported() + + self.registry = CollectorRegistry() + + self.must_collect_metrics = True + logger.debug( + "MetricsCollector initialized with directory=%s, must_collect=%s", + settings.directory, + self.must_collect_metrics, + ) + + # ========================================================================= + # Static: Multiprocess metrics aggregation + HTTP serving + # ========================================================================= + + @staticmethod + def provide_metrics(settings: MetricsSettings) -> None: + if settings is None: + return + + # Ignore SIGINT in this subprocess -- Ctrl-C is handled by the parent + # via SIGTERM from TaskHandler.stop_processes(). Without this, Ctrl-C + # produces noisy KeyboardInterrupt tracebacks from time.sleep(). + try: + signal.signal(signal.SIGINT, signal.SIG_IGN) + except Exception: + pass + + os.environ["PROMETHEUS_MULTIPROC_DIR"] = settings.directory + + _ensure_prometheus_imported() + + OUTPUT_FILE_PATH = os.path.join( + settings.directory, + settings.file_name + ) + + time.sleep(0.5) + + registry = CollectorRegistry() + from prometheus_client.multiprocess import MultiProcessCollector as MPCollector + from prometheus_client.samples import Sample + from prometheus_client.metrics_core import Metric + + class NoPidCollector(MPCollector): + """Custom collector that removes pid label and aggregates metrics across processes.""" + def collect(self): + for metric in super().collect(): + aggregated = {} + + for sample in metric.samples: + labels = {k: v for k, v in sample.labels.items() if k != 'pid'} + label_items = tuple(sorted(labels.items())) + key = (sample.name, label_items) + + if key not in aggregated: + aggregated[key] = { + 'labels': labels, + 'values': [], + 'name': sample.name, + 'timestamp': sample.timestamp, + 'exemplar': sample.exemplar + } + + aggregated[key]['values'].append(sample.value) + + filtered_samples = [] + for key, data in aggregated.items(): + if metric.type == 'counter' or data['name'].endswith('_count') or data['name'].endswith('_sum'): + value = sum(data['values']) + elif metric.type == 'histogram' or '_bucket' in data['name']: + value = sum(data['values']) + elif 'quantile' in data['labels']: + value = sum(data['values']) / len(data['values']) + else: + value = data['values'][-1] + + filtered_samples.append( + Sample(data['name'], data['labels'], value, data['timestamp'], data['exemplar']) + ) + + new_metric = Metric(metric.name, metric.documentation, metric.type) + new_metric.samples = filtered_samples + yield new_metric + + NoPidCollector(registry) + + http_server = None + if settings.http_port is not None: + http_server = MetricsCollectorBase._start_http_server(settings.http_port, registry) + logger.info(f"Metrics HTTP server mode: serving from memory (no file writes) (pid={os.getpid()})") + + while True: + time.sleep(settings.update_interval) + else: + logger.info(f"Metrics file mode: writing to {OUTPUT_FILE_PATH} (pid={os.getpid()})") + while True: + try: + write_to_textfile( + OUTPUT_FILE_PATH, + registry + ) + except Exception as e: + logger.debug(f"Error writing metrics (will retry): {e}") + + time.sleep(settings.update_interval) + + @staticmethod + def _start_http_server(port: int, registry) -> 'HTTPServer': + """Start HTTP server to expose metrics endpoint for Prometheus scraping.""" + from http.server import HTTPServer, BaseHTTPRequestHandler + import threading as _threading + + class MetricsHTTPHandler(BaseHTTPRequestHandler): + def do_GET(self): + if self.path == '/metrics': + try: + from prometheus_client import generate_latest + metrics_content = generate_latest(registry) + + self.send_response(200) + self.send_header('Content-Type', 'text/plain; version=0.0.4; charset=utf-8') + self.end_headers() + self.wfile.write(metrics_content) + + except Exception as e: + logger.error(f"Error serving metrics: {e}") + self.send_response(500) + self.send_header('Content-Type', 'text/plain') + self.end_headers() + self.wfile.write(f'Error: {str(e)}'.encode('utf-8')) + + elif self.path == '/' or self.path == '/health': + self.send_response(200) + self.send_header('Content-Type', 'text/plain') + self.end_headers() + self.wfile.write(b'OK') + + else: + self.send_response(404) + self.send_header('Content-Type', 'text/plain') + self.end_headers() + self.wfile.write(b'Not Found - Try /metrics') + + def log_message(self, format, *args): + logger.debug(f"HTTP {self.address_string()} - {format % args}") + + server = HTTPServer(('', port), MetricsHTTPHandler) + logger.info(f"Started metrics HTTP server on port {port} (pid={os.getpid()})") + logger.info(f"Metrics available at: http://localhost:{port}/metrics") + + server_thread = _threading.Thread(target=server.serve_forever, daemon=True) + server_thread.start() + + return server + + # ========================================================================= + # Shared Prometheus helpers (used by subclasses) + # ========================================================================= + + def _increment_counter( + self, + name: MetricName, + documentation: MetricDocumentation, + labels: Dict[MetricLabel, str] + ) -> None: + if not self.must_collect_metrics: + return + with self._lock: + counter = self._get_counter( + name=name, + documentation=documentation, + labelnames=[label.value for label in labels.keys()] + ) + counter.labels(*labels.values()).inc() + + def _record_gauge( + self, + name: MetricName, + documentation: MetricDocumentation, + labels: Dict[MetricLabel, str], + value: Any + ) -> None: + if not self.must_collect_metrics: + return + with self._lock: + gauge = self._get_gauge( + name=name, + documentation=documentation, + labelnames=[label.value for label in labels.keys()] + ) + gauge.labels(*labels.values()).set(value) + + def _observe_histogram( + self, + name: MetricName, + documentation: MetricDocumentation, + labels: Dict[MetricLabel, str], + value: Any, + buckets=None + ) -> None: + if not self.must_collect_metrics: + return + with self._lock: + histogram = self._get_histogram( + name=name, + documentation=documentation, + labelnames=[label.value for label in labels.keys()], + buckets=buckets + ) + histogram.labels(*labels.values()).observe(value) + + def _get_counter(self, name, documentation, labelnames): + if name not in self.counters: + self.counters[name] = Counter( + name=name, + documentation=documentation, + labelnames=labelnames, + registry=self.registry + ) + return self.counters[name] + + def _get_gauge(self, name, documentation, labelnames): + if name not in self.gauges: + self.gauges[name] = Gauge( + name=name, + documentation=documentation, + labelnames=labelnames, + registry=self.registry, + multiprocess_mode='all' + ) + return self.gauges[name] + + def _get_histogram(self, name, documentation, labelnames, buckets=None): + if name not in self.histograms: + kwargs = dict( + name=name, + documentation=documentation, + labelnames=labelnames, + registry=self.registry, + ) + if buckets is not None: + kwargs['buckets'] = buckets + self.histograms[name] = Histogram(**kwargs) + return self.histograms[name] + + # ========================================================================= + # Abstract metric methods -- the full union surface. + # Subclasses implement each with real logic or pass (no-op). + # ========================================================================= + + @abc.abstractmethod + def increment_task_poll(self, task_type: str) -> None: ... + + @abc.abstractmethod + def increment_task_poll_error(self, task_type: str, exception) -> None: ... + + @abc.abstractmethod + def increment_task_execution_started(self, task_type: str) -> None: ... + + @abc.abstractmethod + def increment_task_execution_queue_full(self, task_type: str) -> None: ... + + @abc.abstractmethod + def increment_uncaught_exception(self, exception=None) -> None: ... + + @abc.abstractmethod + def increment_worker_restart(self, task_type: str) -> None: ... + + @abc.abstractmethod + def increment_task_paused(self, task_type: str) -> None: ... + + @abc.abstractmethod + def increment_task_execution_error(self, task_type: str, exception) -> None: ... + + @abc.abstractmethod + def increment_task_ack_failed(self, task_type: str) -> None: ... + + @abc.abstractmethod + def increment_task_ack_error(self, task_type: str, exception) -> None: ... + + @abc.abstractmethod + def increment_task_update_error(self, task_type: str, exception) -> None: ... + + @abc.abstractmethod + def increment_external_payload_used(self, entity_name: str, operation: str, payload_type: str) -> None: ... + + @abc.abstractmethod + def increment_workflow_start_error(self, workflow_type: str, exception) -> None: ... + + @abc.abstractmethod + def record_task_poll_time(self, task_type: str, time_spent: float, status: str = "SUCCESS") -> None: ... + + @abc.abstractmethod + def record_task_execute_time(self, task_type: str, time_spent: float, status: str = "SUCCESS") -> None: ... + + @abc.abstractmethod + def record_task_update_time(self, task_type: str, time_spent: float, status: str = "SUCCESS") -> None: ... + + @abc.abstractmethod + def record_api_request_time(self, method: str, uri: str, status: str, time_spent: float) -> None: ... + + @abc.abstractmethod + def record_task_result_payload_size(self, task_type: str, payload_size: int) -> None: ... + + @abc.abstractmethod + def record_workflow_input_payload_size(self, workflow_type: str, version: str, payload_size: int) -> None: ... + + # ========================================================================= + # Concrete event handlers -- delegate to the abstract metric methods. + # These satisfy the event listener protocols in event/listeners.py. + # ========================================================================= + + def on_poll_started(self, event: PollStarted) -> None: + self.increment_task_poll(event.task_type) + + def on_poll_completed(self, event: PollCompleted) -> None: + self.record_task_poll_time(event.task_type, event.duration_ms / 1000, status="SUCCESS") + + def on_poll_failure(self, event: PollFailure) -> None: + self.increment_task_poll_error(event.task_type, event.cause) + if hasattr(event, 'duration_ms') and event.duration_ms is not None: + self.record_task_poll_time(event.task_type, event.duration_ms / 1000, status="FAILURE") + + def on_task_execution_started(self, event: TaskExecutionStarted) -> None: + self.increment_task_execution_started(event.task_type) + + def on_task_execution_completed(self, event: TaskExecutionCompleted) -> None: + self.record_task_execute_time(event.task_type, event.duration_ms / 1000, status="SUCCESS") + if event.output_size_bytes is not None: + self.record_task_result_payload_size(event.task_type, event.output_size_bytes) + + def on_task_execution_failure(self, event: TaskExecutionFailure) -> None: + self.increment_task_execution_error(event.task_type, event.cause) + if hasattr(event, 'duration_ms') and event.duration_ms is not None: + self.record_task_execute_time(event.task_type, event.duration_ms / 1000, status="FAILURE") + + def on_workflow_started(self, event: WorkflowStarted) -> None: + if not event.success and event.cause is not None: + self.increment_workflow_start_error(event.name, event.cause) + + def on_workflow_input_payload_size(self, event: WorkflowInputPayloadSize) -> None: + version_str = str(event.version) if event.version is not None else "1" + self.record_workflow_input_payload_size(event.name, version_str, event.size_bytes) + + def on_workflow_payload_used(self, event: WorkflowPayloadUsed) -> None: + self.increment_external_payload_used(event.name, event.operation, event.payload_type) + + def on_task_result_payload_size(self, event: TaskResultPayloadSize) -> None: + self.record_task_result_payload_size(event.task_type, event.size_bytes) + + def on_task_payload_used(self, event: TaskPayloadUsed) -> None: + self.increment_external_payload_used(event.task_type, event.operation, event.payload_type) diff --git a/src/conductor/client/telemetry/metrics_factory.py b/src/conductor/client/telemetry/metrics_factory.py new file mode 100644 index 000000000..3b3e4187c --- /dev/null +++ b/src/conductor/client/telemetry/metrics_factory.py @@ -0,0 +1,45 @@ +""" +Factory that selects the correct MetricsCollector implementation based on +environment variables. + + WORKER_CANONICAL_METRICS=true -> CanonicalMetricsCollector + WORKER_LEGACY_METRICS=true -> LegacyMetricsCollector (default during deprecation) + +If WORKER_CANONICAL_METRICS is true it takes priority regardless of the value +of WORKER_LEGACY_METRICS. +""" + +import logging +import os + +from conductor.client.configuration.configuration import Configuration +from conductor.client.configuration.settings.metrics_settings import MetricsSettings +from conductor.client.telemetry.metrics_collector_base import MetricsCollectorBase + +logger = logging.getLogger( + Configuration.get_logging_formatted_name(__name__) +) + + +def _env_bool(name: str, default: bool) -> bool: + value = os.environ.get(name, "") + if not value: + return default + return value.strip().lower() in ("true", "1", "yes") + + +def create_metrics_collector(settings: MetricsSettings) -> MetricsCollectorBase: + """ + Create the metrics collector selected by environment variables. + + Returns a fully-initialised collector (legacy or canonical) that satisfies + the MetricsCollector Protocol and can be registered as an event listener. + """ + if _env_bool("WORKER_CANONICAL_METRICS", default=False): + from conductor.client.telemetry.canonical_metrics_collector import CanonicalMetricsCollector + logger.info("WORKER_CANONICAL_METRICS is true — using CanonicalMetricsCollector") + return CanonicalMetricsCollector(settings) + + from conductor.client.telemetry.legacy_metrics_collector import LegacyMetricsCollector + logger.info("Using LegacyMetricsCollector (set WORKER_CANONICAL_METRICS=true for canonical metrics)") + return LegacyMetricsCollector(settings) diff --git a/src/conductor/client/telemetry/model/metric_documentation.py b/src/conductor/client/telemetry/model/metric_documentation.py index d76ad063c..47d8c2736 100644 --- a/src/conductor/client/telemetry/model/metric_documentation.py +++ b/src/conductor/client/telemetry/model/metric_documentation.py @@ -2,6 +2,7 @@ class MetricDocumentation(str, Enum): + # Shared / legacy documentation strings API_REQUEST_TIME = "API request duration in seconds with quantiles" EXTERNAL_PAYLOAD_USED = "Incremented each time external payload storage is used" TASK_ACK_ERROR = "Task ack has encountered an exception" @@ -21,3 +22,11 @@ class MetricDocumentation(str, Enum): WORKER_RESTART = "Worker subprocess restarted" WORKFLOW_START_ERROR = "Counter for workflow start errors" WORKFLOW_INPUT_SIZE = "Records input payload size of a workflow" + + # Canonical-only documentation strings + API_REQUEST_TIME_CANONICAL = "Latency of HTTP API client requests in seconds" + TASK_EXECUTION_STARTED = "Incremented when a polled task is dispatched to the worker function" + TASK_POLL_ERROR = "Incremented when a poll request fails client-side" + TASK_RESULT_SIZE_BYTES = "Serialized byte size of task result output" + WORKFLOW_INPUT_SIZE_BYTES = "Serialized byte size of workflow input" + ACTIVE_WORKERS = "Current number of worker threads actively executing a task" diff --git a/src/conductor/client/telemetry/model/metric_label.py b/src/conductor/client/telemetry/model/metric_label.py index 7aeae21ef..8d4a63f70 100644 --- a/src/conductor/client/telemetry/model/metric_label.py +++ b/src/conductor/client/telemetry/model/metric_label.py @@ -6,7 +6,8 @@ class MetricLabel(str, Enum): EXCEPTION = "exception" METHOD = "method" OPERATION = "operation" - PAYLOAD_TYPE = "payload_type" + PAYLOAD_TYPE = "payloadType" + PAYLOAD_TYPE_LEGACY = "payload_type" STATUS = "status" TASK_TYPE = "taskType" URI = "uri" diff --git a/src/conductor/client/telemetry/model/metric_name.py b/src/conductor/client/telemetry/model/metric_name.py index 89810865d..d8485a267 100644 --- a/src/conductor/client/telemetry/model/metric_name.py +++ b/src/conductor/client/telemetry/model/metric_name.py @@ -2,6 +2,7 @@ class MetricName(str, Enum): + # Legacy metric names (pre-harmonization) API_REQUEST_TIME = "http_api_client_request" EXTERNAL_PAYLOAD_USED = "external_payload_used" TASK_ACK_ERROR = "task_ack_error" @@ -21,3 +22,11 @@ class MetricName(str, Enum): WORKER_RESTART = "worker_restart" WORKFLOW_INPUT_SIZE = "workflow_input_size" WORKFLOW_START_ERROR = "workflow_start_error" + + # Canonical-only metric names (harmonization additions) + API_REQUEST_TIME_CANONICAL = "http_api_client_request_seconds" + TASK_EXECUTION_STARTED = "task_execution_started" + TASK_POLL_ERROR = "task_poll_error" + TASK_RESULT_SIZE_BYTES = "task_result_size_bytes" + WORKFLOW_INPUT_SIZE_BYTES = "workflow_input_size_bytes" + ACTIVE_WORKERS = "active_workers" diff --git a/src/conductor/client/workflow/executor/workflow_executor.py b/src/conductor/client/workflow/executor/workflow_executor.py index 20074de09..c8ff1f939 100644 --- a/src/conductor/client/workflow/executor/workflow_executor.py +++ b/src/conductor/client/workflow/executor/workflow_executor.py @@ -27,11 +27,11 @@ class WorkflowExecutor: - def __init__(self, configuration: Configuration) -> Self: - api_client = ApiClient(configuration) + def __init__(self, configuration: Configuration, metrics_collector=None) -> Self: + api_client = ApiClient(configuration, metrics_collector=metrics_collector) self.metadata_client = MetadataResourceApi(api_client) self.task_client = TaskResourceApi(api_client) - self.workflow_client = OrkesWorkflowClient(configuration) + self.workflow_client = OrkesWorkflowClient(configuration, metrics_collector=metrics_collector) def register_workflow(self, workflow: WorkflowDef, overwrite: Optional[bool] = None) -> object: """Create a new workflow definition""" diff --git a/tests/unit/telemetry/test_canonical_metrics_collector.py b/tests/unit/telemetry/test_canonical_metrics_collector.py new file mode 100644 index 000000000..16983ae24 --- /dev/null +++ b/tests/unit/telemetry/test_canonical_metrics_collector.py @@ -0,0 +1,216 @@ +""" +Tests for CanonicalMetricsCollector. + +Verifies that canonical metric names, types, labels, and bucket sets are +correct per the sdk-metrics-harmonization spec. +""" + +import os +import shutil +import tempfile +import unittest + +from prometheus_client import write_to_textfile + +from conductor.client.configuration.settings.metrics_settings import MetricsSettings +from conductor.client.telemetry.canonical_metrics_collector import CanonicalMetricsCollector +from conductor.client.telemetry.metrics_collector_base import _exception_label +from conductor.client.event.task_runner_events import ( + PollStarted, + PollCompleted, + PollFailure, + TaskExecutionStarted, + TaskExecutionCompleted, + TaskExecutionFailure, +) + + +class TestCanonicalMetricsCollector(unittest.TestCase): + + def setUp(self): + self.metrics_dir = tempfile.mkdtemp() + self.settings = MetricsSettings(directory=self.metrics_dir) + self.collector = CanonicalMetricsCollector(self.settings) + + def tearDown(self): + if os.path.exists(self.metrics_dir): + shutil.rmtree(self.metrics_dir) + + def _get_metrics_text(self): + path = os.path.join(self.metrics_dir, "test_out.prom") + write_to_textfile(path, self.collector.registry) + with open(path) as f: + return f.read() + + # ------------------------------------------------------------------ + # Counters + # ------------------------------------------------------------------ + + def test_task_poll_counter(self): + self.collector.increment_task_poll("my_task") + text = self._get_metrics_text() + self.assertIn('task_poll_total{taskType="my_task"}', text) + + def test_task_poll_error_counter(self): + self.collector.increment_task_poll_error("my_task", RuntimeError("oops")) + text = self._get_metrics_text() + self.assertIn("task_poll_error_total", text) + self.assertIn('exception="RuntimeError"', text) + + def test_task_execution_started_counter(self): + self.collector.increment_task_execution_started("my_task") + text = self._get_metrics_text() + self.assertIn('task_execution_started_total{taskType="my_task"}', text) + + def test_uncaught_exception_with_label(self): + self.collector.increment_uncaught_exception(ValueError("bad")) + text = self._get_metrics_text() + self.assertIn("thread_uncaught_exceptions_total", text) + self.assertIn('exception="ValueError"', text) + + def test_external_payload_uses_camelcase_label(self): + self.collector.increment_external_payload_used("ent", "READ", "TASK_INPUT") + text = self._get_metrics_text() + self.assertIn("external_payload_used_total", text) + self.assertIn('payloadType="TASK_INPUT"', text) + + def test_exception_label_uses_class_name(self): + self.collector.increment_task_execution_error("t", ValueError("x")) + text = self._get_metrics_text() + self.assertIn('exception="ValueError"', text) + + def test_workflow_start_error(self): + self.collector.increment_workflow_start_error("wf", Exception("fail")) + text = self._get_metrics_text() + self.assertIn("workflow_start_error_total", text) + self.assertIn('exception="Exception"', text) + + # ------------------------------------------------------------------ + # Time Histograms + # ------------------------------------------------------------------ + + def test_task_poll_time_is_histogram(self): + self.collector.record_task_poll_time("my_task", 0.05, status="SUCCESS") + text = self._get_metrics_text() + self.assertIn("task_poll_time_seconds_bucket", text) + self.assertIn("task_poll_time_seconds_count", text) + self.assertIn("task_poll_time_seconds_sum", text) + self.assertIn('le="0.1"', text) + + def test_task_execute_time_is_histogram(self): + self.collector.record_task_execute_time("my_task", 0.5, status="SUCCESS") + text = self._get_metrics_text() + self.assertIn("task_execute_time_seconds_bucket", text) + + def test_task_update_time_is_histogram(self): + self.collector.record_task_update_time("my_task", 0.2, status="FAILURE") + text = self._get_metrics_text() + self.assertIn("task_update_time_seconds_bucket", text) + self.assertIn('status="FAILURE"', text) + + def test_api_request_time_canonical_name(self): + self.collector.record_api_request_time("GET", "/api/tasks", "200", 0.1) + text = self._get_metrics_text() + self.assertIn("http_api_client_request_seconds_bucket", text) + self.assertIn("http_api_client_request_seconds_count", text) + + def test_time_histogram_bucket_set(self): + """Canonical time histograms use the spec bucket set.""" + self.collector.record_task_poll_time("my_task", 0.001) + text = self._get_metrics_text() + for boundary in ("0.001", "0.005", "0.01", "0.025", "0.05", "0.1", + "0.25", "0.5", "1.0", "2.5", "5.0", "10.0"): + self.assertIn(f'le="{boundary}"', text) + + # ------------------------------------------------------------------ + # Size Histograms + # ------------------------------------------------------------------ + + def test_task_result_size_bytes_histogram(self): + self.collector.record_task_result_payload_size("my_task", 5000) + text = self._get_metrics_text() + self.assertIn("task_result_size_bytes_bucket", text) + self.assertIn("task_result_size_bytes_count", text) + + def test_workflow_input_size_bytes_histogram(self): + self.collector.record_workflow_input_payload_size("wf", "1", 50000) + text = self._get_metrics_text() + self.assertIn("workflow_input_size_bytes_bucket", text) + + def test_size_histogram_bucket_set(self): + """Canonical size histograms use the spec bucket set.""" + self.collector.record_task_result_payload_size("t", 500) + text = self._get_metrics_text() + # prometheus_client uses scientific notation for large values + for boundary in ("100.0", "1000.0", "10000.0", "100000.0", + "1e+06", "1e+07"): + self.assertIn(f'le="{boundary}"', text) + + # ------------------------------------------------------------------ + # Event handler integration + # ------------------------------------------------------------------ + + def test_event_poll_started_increments_counter(self): + event = PollStarted(task_type="t", worker_id="w", poll_count=1) + self.collector.on_poll_started(event) + text = self._get_metrics_text() + self.assertIn('task_poll_total{taskType="t"}', text) + + def test_event_poll_completed_records_histogram(self): + event = PollCompleted(task_type="t", duration_ms=100.0, tasks_received=1) + self.collector.on_poll_completed(event) + text = self._get_metrics_text() + self.assertIn("task_poll_time_seconds_bucket", text) + + def test_event_poll_failure_records_error_and_histogram(self): + event = PollFailure(task_type="t", duration_ms=50.0, cause=RuntimeError("x")) + self.collector.on_poll_failure(event) + text = self._get_metrics_text() + self.assertIn("task_poll_error_total", text) + self.assertIn("task_poll_time_seconds_bucket", text) + + def test_event_execution_started(self): + event = TaskExecutionStarted(task_type="t", task_id="id", worker_id="w", workflow_instance_id="wf") + self.collector.on_task_execution_started(event) + text = self._get_metrics_text() + self.assertIn("task_execution_started_total", text) + + def test_event_execution_completed_records_histogram_and_size(self): + event = TaskExecutionCompleted( + task_type="t", task_id="id", worker_id="w", + workflow_instance_id="wf", duration_ms=200.0, output_size_bytes=1024, + ) + self.collector.on_task_execution_completed(event) + text = self._get_metrics_text() + self.assertIn("task_execute_time_seconds_bucket", text) + self.assertIn("task_result_size_bytes_bucket", text) + + def test_event_execution_failure_records_counter_and_histogram(self): + event = TaskExecutionFailure( + task_type="t", task_id="id", worker_id="w", + workflow_instance_id="wf", cause=ValueError("bad"), duration_ms=100.0, + ) + self.collector.on_task_execution_failure(event) + text = self._get_metrics_text() + self.assertIn("task_execute_error_total", text) + self.assertIn("task_execute_time_seconds_bucket", text) + + +class TestExceptionLabel(unittest.TestCase): + """Test the _exception_label helper.""" + + def test_none(self): + self.assertEqual(_exception_label(None), "None") + + def test_exception_instance(self): + self.assertEqual(_exception_label(ValueError("x")), "ValueError") + + def test_exception_class(self): + self.assertEqual(_exception_label(RuntimeError), "RuntimeError") + + def test_string_passthrough(self): + self.assertEqual(_exception_label("SomeError"), "SomeError") + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/unit/telemetry/test_metrics_factory.py b/tests/unit/telemetry/test_metrics_factory.py new file mode 100644 index 000000000..586047aa5 --- /dev/null +++ b/tests/unit/telemetry/test_metrics_factory.py @@ -0,0 +1,117 @@ +""" +Tests for the metrics factory and gated metrics selection. +""" + +import os +import shutil +import tempfile +import unittest + +from conductor.client.configuration.settings.metrics_settings import MetricsSettings +from conductor.client.telemetry.metrics_factory import create_metrics_collector +from conductor.client.telemetry.legacy_metrics_collector import LegacyMetricsCollector +from conductor.client.telemetry.canonical_metrics_collector import CanonicalMetricsCollector + + +class TestMetricsFactory(unittest.TestCase): + + def setUp(self): + self.metrics_dir = tempfile.mkdtemp() + self.settings = MetricsSettings(directory=self.metrics_dir) + self._saved_env = {} + for key in ("WORKER_CANONICAL_METRICS", "WORKER_LEGACY_METRICS"): + self._saved_env[key] = os.environ.pop(key, None) + + def tearDown(self): + for key, val in self._saved_env.items(): + if val is None: + os.environ.pop(key, None) + else: + os.environ[key] = val + if os.path.exists(self.metrics_dir): + shutil.rmtree(self.metrics_dir) + + def test_default_returns_legacy(self): + """With no env vars set, factory returns LegacyMetricsCollector.""" + collector = create_metrics_collector(self.settings) + self.assertIsInstance(collector, LegacyMetricsCollector) + + def test_canonical_true_returns_canonical(self): + """WORKER_CANONICAL_METRICS=true selects CanonicalMetricsCollector.""" + os.environ["WORKER_CANONICAL_METRICS"] = "true" + collector = create_metrics_collector(self.settings) + self.assertIsInstance(collector, CanonicalMetricsCollector) + + def test_canonical_1_returns_canonical(self): + """WORKER_CANONICAL_METRICS=1 selects CanonicalMetricsCollector.""" + os.environ["WORKER_CANONICAL_METRICS"] = "1" + collector = create_metrics_collector(self.settings) + self.assertIsInstance(collector, CanonicalMetricsCollector) + + def test_canonical_false_returns_legacy(self): + """WORKER_CANONICAL_METRICS=false selects LegacyMetricsCollector.""" + os.environ["WORKER_CANONICAL_METRICS"] = "false" + collector = create_metrics_collector(self.settings) + self.assertIsInstance(collector, LegacyMetricsCollector) + + def test_canonical_takes_priority_over_legacy(self): + """WORKER_CANONICAL_METRICS=true wins even if WORKER_LEGACY_METRICS=true.""" + os.environ["WORKER_CANONICAL_METRICS"] = "true" + os.environ["WORKER_LEGACY_METRICS"] = "true" + collector = create_metrics_collector(self.settings) + self.assertIsInstance(collector, CanonicalMetricsCollector) + + def test_both_implementations_satisfy_same_interface(self): + """Both implementations have the same public method surface.""" + legacy = LegacyMetricsCollector(self.settings) + os.environ["WORKER_CANONICAL_METRICS"] = "true" + canonical = create_metrics_collector( + MetricsSettings(directory=tempfile.mkdtemp()) + ) + + required_methods = [ + "increment_task_poll", + "increment_task_poll_error", + "increment_task_execution_started", + "increment_task_execution_queue_full", + "increment_uncaught_exception", + "increment_worker_restart", + "increment_task_paused", + "increment_task_execution_error", + "increment_task_ack_failed", + "increment_task_ack_error", + "increment_task_update_error", + "increment_external_payload_used", + "increment_workflow_start_error", + "record_task_poll_time", + "record_task_execute_time", + "record_task_update_time", + "record_api_request_time", + "record_task_result_payload_size", + "record_workflow_input_payload_size", + "on_poll_started", + "on_poll_completed", + "on_poll_failure", + "on_task_execution_started", + "on_task_execution_completed", + "on_task_execution_failure", + "on_workflow_started", + "on_workflow_input_payload_size", + "on_workflow_payload_used", + "on_task_result_payload_size", + "on_task_payload_used", + ] + + for method_name in required_methods: + self.assertTrue( + hasattr(legacy, method_name) and callable(getattr(legacy, method_name)), + f"LegacyMetricsCollector missing method: {method_name}", + ) + self.assertTrue( + hasattr(canonical, method_name) and callable(getattr(canonical, method_name)), + f"CanonicalMetricsCollector missing method: {method_name}", + ) + + +if __name__ == "__main__": + unittest.main() From 9d671cafeae9c3f115a1c01d84dda04a3e79a01d Mon Sep 17 00:00:00 2001 From: Chris Hagglund Date: Wed, 29 Apr 2026 11:24:07 -0600 Subject: [PATCH 02/24] validate the multiprocessing start method choice --- src/conductor/client/automator/task_handler.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/src/conductor/client/automator/task_handler.py b/src/conductor/client/automator/task_handler.py index 6a8255bd0..74dc62731 100644 --- a/src/conductor/client/automator/task_handler.py +++ b/src/conductor/client/automator/task_handler.py @@ -33,6 +33,7 @@ ) _decorated_functions = {} +_VALID_MP_START_METHODS = {"spawn", "fork", "forkserver"} _mp_fork_set = False if not _mp_fork_set: try: @@ -42,7 +43,13 @@ # deadlock. Default is fork for backward compatibility (spawn requires # all Process arguments to be picklable). _default_method = "spawn" if platform == "win32" else "fork" - _method = os.environ.get("CONDUCTOR_MP_START_METHOD", _default_method).strip().lower() + _method = os.environ.get("CONDUCTOR_MP_START_METHOD", "").strip().lower() or _default_method + if _method not in _VALID_MP_START_METHODS: + logger.warning( + "Ignoring invalid CONDUCTOR_MP_START_METHOD=%r; falling back to %r", + _method, _default_method, + ) + _method = _default_method set_start_method(_method) _mp_fork_set = True except Exception as e: From 98cf8ace68f1cbf0d81d3cc8bd1d4e84bcd1008f Mon Sep 17 00:00:00 2001 From: Chris Hagglund Date: Wed, 29 Apr 2026 12:32:13 -0600 Subject: [PATCH 03/24] remove unnecessary sleep --- src/conductor/client/telemetry/metrics_collector_base.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/conductor/client/telemetry/metrics_collector_base.py b/src/conductor/client/telemetry/metrics_collector_base.py index edb72d6a5..a539a25d0 100644 --- a/src/conductor/client/telemetry/metrics_collector_base.py +++ b/src/conductor/client/telemetry/metrics_collector_base.py @@ -149,8 +149,6 @@ def provide_metrics(settings: MetricsSettings) -> None: settings.file_name ) - time.sleep(0.5) - registry = CollectorRegistry() from prometheus_client.multiprocess import MultiProcessCollector as MPCollector from prometheus_client.samples import Sample From 6fed50502e878ca1c6c6df56a014624b1e1dce4a Mon Sep 17 00:00:00 2001 From: Chris Hagglund Date: Thu, 30 Apr 2026 13:07:32 -0600 Subject: [PATCH 04/24] fixes for a few canonical metrics --- .../telemetry/canonical_metrics_collector.py | 12 ++++ .../telemetry/legacy_metrics_collector.py | 7 +++ .../telemetry/metrics_collector_base.py | 11 ++++ .../test_canonical_metrics_collector.py | 57 +++++++++++++++++++ .../unit/telemetry/test_metrics_collector.py | 39 +++++++++++++ 5 files changed, 126 insertions(+) diff --git a/src/conductor/client/telemetry/canonical_metrics_collector.py b/src/conductor/client/telemetry/canonical_metrics_collector.py index f77a8890e..eaf876085 100644 --- a/src/conductor/client/telemetry/canonical_metrics_collector.py +++ b/src/conductor/client/telemetry/canonical_metrics_collector.py @@ -145,6 +145,18 @@ def increment_workflow_start_error(self, workflow_type: str, exception) -> None: }, ) + # ------------------------------------------------------------------ + # Gauges + # ------------------------------------------------------------------ + + def set_active_workers(self, task_type: str, count: int) -> None: + self._record_gauge( + name=MetricName.ACTIVE_WORKERS, + documentation=MetricDocumentation.ACTIVE_WORKERS, + labels={MetricLabel.TASK_TYPE: task_type}, + value=count, + ) + # ------------------------------------------------------------------ # Timing (real Prometheus Histograms) # ------------------------------------------------------------------ diff --git a/src/conductor/client/telemetry/legacy_metrics_collector.py b/src/conductor/client/telemetry/legacy_metrics_collector.py index e819810b9..114f2a26d 100644 --- a/src/conductor/client/telemetry/legacy_metrics_collector.py +++ b/src/conductor/client/telemetry/legacy_metrics_collector.py @@ -129,6 +129,13 @@ def increment_workflow_start_error(self, workflow_type: str, exception) -> None: }, ) + # ------------------------------------------------------------------ + # Gauges (canonical-only, no-op in legacy) + # ------------------------------------------------------------------ + + def set_active_workers(self, task_type: str, count: int) -> None: + pass + # ------------------------------------------------------------------ # Timing (last-value gauges + quantile gauges) # ------------------------------------------------------------------ diff --git a/src/conductor/client/telemetry/metrics_collector_base.py b/src/conductor/client/telemetry/metrics_collector_base.py index a539a25d0..c81b26f67 100644 --- a/src/conductor/client/telemetry/metrics_collector_base.py +++ b/src/conductor/client/telemetry/metrics_collector_base.py @@ -1,4 +1,5 @@ import abc +import collections import logging import os import signal @@ -106,6 +107,7 @@ def __init__(self, settings: MetricsSettings): self.registry = None self.must_collect_metrics = False self._lock = threading.RLock() + self._active_worker_counts: Dict[str, int] = collections.defaultdict(int) if settings is None: return @@ -418,6 +420,9 @@ def record_task_result_payload_size(self, task_type: str, payload_size: int) -> @abc.abstractmethod def record_workflow_input_payload_size(self, workflow_type: str, version: str, payload_size: int) -> None: ... + @abc.abstractmethod + def set_active_workers(self, task_type: str, count: int) -> None: ... + # ========================================================================= # Concrete event handlers -- delegate to the abstract metric methods. # These satisfy the event listener protocols in event/listeners.py. @@ -436,16 +441,22 @@ def on_poll_failure(self, event: PollFailure) -> None: def on_task_execution_started(self, event: TaskExecutionStarted) -> None: self.increment_task_execution_started(event.task_type) + self._active_worker_counts[event.task_type] += 1 + self.set_active_workers(event.task_type, self._active_worker_counts[event.task_type]) def on_task_execution_completed(self, event: TaskExecutionCompleted) -> None: self.record_task_execute_time(event.task_type, event.duration_ms / 1000, status="SUCCESS") if event.output_size_bytes is not None: self.record_task_result_payload_size(event.task_type, event.output_size_bytes) + self._active_worker_counts[event.task_type] = max(0, self._active_worker_counts[event.task_type] - 1) + self.set_active_workers(event.task_type, self._active_worker_counts[event.task_type]) def on_task_execution_failure(self, event: TaskExecutionFailure) -> None: self.increment_task_execution_error(event.task_type, event.cause) if hasattr(event, 'duration_ms') and event.duration_ms is not None: self.record_task_execute_time(event.task_type, event.duration_ms / 1000, status="FAILURE") + self._active_worker_counts[event.task_type] = max(0, self._active_worker_counts[event.task_type] - 1) + self.set_active_workers(event.task_type, self._active_worker_counts[event.task_type]) def on_workflow_started(self, event: WorkflowStarted) -> None: if not event.success and event.cause is not None: diff --git a/tests/unit/telemetry/test_canonical_metrics_collector.py b/tests/unit/telemetry/test_canonical_metrics_collector.py index 16983ae24..3b0ea4e73 100644 --- a/tests/unit/telemetry/test_canonical_metrics_collector.py +++ b/tests/unit/telemetry/test_canonical_metrics_collector.py @@ -195,6 +195,63 @@ def test_event_execution_failure_records_counter_and_histogram(self): self.assertIn("task_execute_error_total", text) self.assertIn("task_execute_time_seconds_bucket", text) + # ------------------------------------------------------------------ + # active_workers gauge + # ------------------------------------------------------------------ + + def test_active_workers_gauge_increments_on_start(self): + event = TaskExecutionStarted(task_type="t", task_id="id", worker_id="w", workflow_instance_id="wf") + self.collector.on_task_execution_started(event) + text = self._get_metrics_text() + self.assertIn('active_workers{taskType="t"}', text) + self.assertIn("1.0", text) + + def test_active_workers_gauge_decrements_on_complete(self): + start = TaskExecutionStarted(task_type="t", task_id="id", worker_id="w", workflow_instance_id="wf") + self.collector.on_task_execution_started(start) + self.collector.on_task_execution_started(start) + + complete = TaskExecutionCompleted( + task_type="t", task_id="id", worker_id="w", + workflow_instance_id="wf", duration_ms=100.0, output_size_bytes=None, + ) + self.collector.on_task_execution_completed(complete) + text = self._get_metrics_text() + self.assertIn('active_workers{taskType="t"} 1.0', text) + + def test_active_workers_gauge_decrements_on_failure(self): + start = TaskExecutionStarted(task_type="t", task_id="id", worker_id="w", workflow_instance_id="wf") + self.collector.on_task_execution_started(start) + + failure = TaskExecutionFailure( + task_type="t", task_id="id", worker_id="w", + workflow_instance_id="wf", cause=ValueError("x"), duration_ms=50.0, + ) + self.collector.on_task_execution_failure(failure) + text = self._get_metrics_text() + self.assertIn('active_workers{taskType="t"} 0.0', text) + + def test_active_workers_gauge_floors_at_zero(self): + complete = TaskExecutionCompleted( + task_type="t", task_id="id", worker_id="w", + workflow_instance_id="wf", duration_ms=100.0, output_size_bytes=None, + ) + self.collector.on_task_execution_completed(complete) + text = self._get_metrics_text() + self.assertIn('active_workers{taskType="t"} 0.0', text) + + def test_active_workers_tracks_multiple_task_types(self): + self.collector.on_task_execution_started( + TaskExecutionStarted(task_type="a", task_id="1", worker_id="w", workflow_instance_id="wf")) + self.collector.on_task_execution_started( + TaskExecutionStarted(task_type="a", task_id="2", worker_id="w", workflow_instance_id="wf")) + self.collector.on_task_execution_started( + TaskExecutionStarted(task_type="b", task_id="3", worker_id="w", workflow_instance_id="wf")) + + text = self._get_metrics_text() + self.assertIn('active_workers{taskType="a"} 2.0', text) + self.assertIn('active_workers{taskType="b"} 1.0', text) + class TestExceptionLabel(unittest.TestCase): """Test the _exception_label helper.""" diff --git a/tests/unit/telemetry/test_metrics_collector.py b/tests/unit/telemetry/test_metrics_collector.py index 5471b745a..3d1ee834d 100644 --- a/tests/unit/telemetry/test_metrics_collector.py +++ b/tests/unit/telemetry/test_metrics_collector.py @@ -500,6 +500,45 @@ def test_quantile_sliding_window(self): # Note: _calculate_percentile is not a public method and percentile calculation # is handled internally by prometheus_client Summary objects + # ========================================================================= + # active_workers tracking (base class counter, legacy no-op gauge) + # ========================================================================= + + def test_active_workers_count_tracks_in_base_class(self): + collector = MetricsCollector(self.metrics_settings) + + start = TaskExecutionStarted(task_type='test_task', task_id='t1', worker_id='w', workflow_instance_id='wf') + collector.on_task_execution_started(start) + collector.on_task_execution_started(start) + self.assertEqual(collector._active_worker_counts['test_task'], 2) + + complete = TaskExecutionCompleted( + task_type='test_task', task_id='t1', worker_id='w', + workflow_instance_id='wf', duration_ms=100.0, output_size_bytes=None, + ) + collector.on_task_execution_completed(complete) + self.assertEqual(collector._active_worker_counts['test_task'], 1) + + def test_active_workers_count_floors_at_zero(self): + collector = MetricsCollector(self.metrics_settings) + + complete = TaskExecutionCompleted( + task_type='test_task', task_id='t1', worker_id='w', + workflow_instance_id='wf', duration_ms=100.0, output_size_bytes=None, + ) + collector.on_task_execution_completed(complete) + self.assertEqual(collector._active_worker_counts['test_task'], 0) + + def test_active_workers_not_emitted_in_legacy_mode(self): + collector = MetricsCollector(self.metrics_settings) + + start = TaskExecutionStarted(task_type='test_task', task_id='t1', worker_id='w', workflow_instance_id='wf') + collector.on_task_execution_started(start) + + self._write_metrics(collector) + metrics_content = self._read_metrics_file() + self.assertNotIn('active_workers', metrics_content) + # ========================================================================= # Edge Cases and Boundary Conditions # ========================================================================= From 63974df9c27d02227ac001215ea60b3435f7b5c6 Mon Sep 17 00:00:00 2001 From: Chris Hagglund Date: Wed, 6 May 2026 09:36:57 -0600 Subject: [PATCH 05/24] cleaner reporting on which metrics implementation is used in the test harness worker --- harness/main.py | 5 +---- .../telemetry/canonical_metrics_collector.py | 3 +++ .../client/telemetry/legacy_metrics_collector.py | 3 +++ .../client/telemetry/metrics_collector_base.py | 5 +++++ tests/unit/telemetry/test_metrics_factory.py | 14 ++++++++++++++ 5 files changed, 26 insertions(+), 4 deletions(-) diff --git a/harness/main.py b/harness/main.py index 2be7d8e6c..f556fabed 100644 --- a/harness/main.py +++ b/harness/main.py @@ -71,12 +71,9 @@ def main() -> None: metrics_port = env_int_or_default("HARNESS_METRICS_PORT", 9991) metrics_settings = MetricsSettings(http_port=metrics_port) - print(f"Prometheus metrics will be served on port {metrics_port}") - # Main-process MetricsCollector: writes workflow-client / HTTP metrics into - # the shared PROMETHEUS_MULTIPROC_DIR so they merge with worker subprocess - # metrics in the exported /metrics payload. metrics_collector = create_metrics_collector(metrics_settings) + print(f"Prometheus metrics server started on port {metrics_port} ({metrics_collector.collector_name()} metrics)") clients = OrkesClients(configuration, metrics_collector=metrics_collector) register_metadata(clients) diff --git a/src/conductor/client/telemetry/canonical_metrics_collector.py b/src/conductor/client/telemetry/canonical_metrics_collector.py index eaf876085..42562a8bb 100644 --- a/src/conductor/client/telemetry/canonical_metrics_collector.py +++ b/src/conductor/client/telemetry/canonical_metrics_collector.py @@ -29,6 +29,9 @@ class CanonicalMetricsCollector(MetricsCollectorBase): def __init__(self, settings: MetricsSettings): super().__init__(settings) + def collector_name(self) -> str: + return "canonical" + # ------------------------------------------------------------------ # Counters # ------------------------------------------------------------------ diff --git a/src/conductor/client/telemetry/legacy_metrics_collector.py b/src/conductor/client/telemetry/legacy_metrics_collector.py index 114f2a26d..418e56da0 100644 --- a/src/conductor/client/telemetry/legacy_metrics_collector.py +++ b/src/conductor/client/telemetry/legacy_metrics_collector.py @@ -26,6 +26,9 @@ def __init__(self, settings: MetricsSettings): self.quantile_metrics: Dict[str, Any] = {} self.quantile_data: Dict[str, deque] = {} + def collector_name(self) -> str: + return "legacy" + # ------------------------------------------------------------------ # Counters # ------------------------------------------------------------------ diff --git a/src/conductor/client/telemetry/metrics_collector_base.py b/src/conductor/client/telemetry/metrics_collector_base.py index c81b26f67..358012d8b 100644 --- a/src/conductor/client/telemetry/metrics_collector_base.py +++ b/src/conductor/client/telemetry/metrics_collector_base.py @@ -363,6 +363,11 @@ def _get_histogram(self, name, documentation, labelnames, buckets=None): # Subclasses implement each with real logic or pass (no-op). # ========================================================================= + @abc.abstractmethod + def collector_name(self) -> str: + """Return the name of this collector implementation ('legacy', 'canonical').""" + ... + @abc.abstractmethod def increment_task_poll(self, task_type: str) -> None: ... diff --git a/tests/unit/telemetry/test_metrics_factory.py b/tests/unit/telemetry/test_metrics_factory.py index 586047aa5..ebe17864c 100644 --- a/tests/unit/telemetry/test_metrics_factory.py +++ b/tests/unit/telemetry/test_metrics_factory.py @@ -61,6 +61,19 @@ def test_canonical_takes_priority_over_legacy(self): collector = create_metrics_collector(self.settings) self.assertIsInstance(collector, CanonicalMetricsCollector) + def test_legacy_collector_name(self): + """LegacyMetricsCollector.collector_name() returns 'legacy'.""" + collector = create_metrics_collector(self.settings) + self.assertEqual(collector.collector_name(), "legacy") + + def test_canonical_collector_name(self): + """CanonicalMetricsCollector.collector_name() returns 'canonical'.""" + os.environ["WORKER_CANONICAL_METRICS"] = "true" + collector = create_metrics_collector( + MetricsSettings(directory=tempfile.mkdtemp()) + ) + self.assertEqual(collector.collector_name(), "canonical") + def test_both_implementations_satisfy_same_interface(self): """Both implementations have the same public method surface.""" legacy = LegacyMetricsCollector(self.settings) @@ -70,6 +83,7 @@ def test_both_implementations_satisfy_same_interface(self): ) required_methods = [ + "collector_name", "increment_task_poll", "increment_task_poll_error", "increment_task_execution_started", From 9bca75401a43e9f9977496ea554f155c8d6aba2b Mon Sep 17 00:00:00 2001 From: Chris Hagglund Date: Wed, 6 May 2026 11:40:58 -0600 Subject: [PATCH 06/24] update docs regarding metrics --- METRICS.md | 527 ++++++++---------- README.md | 2 +- WORKER_CONFIGURATION.md | 7 +- docs/design/WORKER_DESIGN.md | 115 +--- .../design/WORKER_SDK_IMPLEMENTATION_GUIDE.md | 43 +- .../design/event_driven_interceptor_system.md | 403 ++------------ 6 files changed, 313 insertions(+), 784 deletions(-) diff --git a/METRICS.md b/METRICS.md index 2201d1c01..28266498d 100644 --- a/METRICS.md +++ b/METRICS.md @@ -1,333 +1,280 @@ -# Metrics Documentation - -The Conductor Python SDK includes built-in metrics collection using Prometheus to monitor worker performance, API requests, and task execution. - -## Table of Contents - -- [Quick Reference](#quick-reference) -- [Configuration](#configuration) -- [Metric Types](#metric-types) -- [Examples](#examples) - -## Quick Reference - -| Metric Name | Type | Labels | Description | -|------------|------|--------|-------------| -| `api_request_time_seconds` | Timer (quantile gauge) | `method`, `uri`, `status`, `quantile` | API request latency to Conductor server | -| `api_request_time_seconds_count` | Gauge | `method`, `uri`, `status` | Total number of API requests | -| `api_request_time_seconds_sum` | Gauge | `method`, `uri`, `status` | Total time spent in API requests | -| `task_poll_total` | Counter | `taskType` | Number of task poll attempts | -| `task_poll_time` | Gauge | `taskType` | Most recent poll duration (legacy) | -| `task_poll_time_seconds` | Timer (quantile gauge) | `taskType`, `status`, `quantile` | Task poll latency distribution | -| `task_poll_time_seconds_count` | Gauge | `taskType`, `status` | Total number of poll attempts by status | -| `task_poll_time_seconds_sum` | Gauge | `taskType`, `status` | Total time spent polling | -| `task_execute_time` | Gauge | `taskType` | Most recent execution duration (legacy) | -| `task_execute_time_seconds` | Timer (quantile gauge) | `taskType`, `status`, `quantile` | Task execution latency distribution | -| `task_execute_time_seconds_count` | Gauge | `taskType`, `status` | Total number of task executions by status | -| `task_execute_time_seconds_sum` | Gauge | `taskType`, `status` | Total time spent executing tasks | -| `task_execute_error_total` | Counter | `taskType`, `exception` | Number of task execution errors | -| `task_update_time_seconds` | Timer (quantile gauge) | `taskType`, `status`, `quantile` | Task update latency distribution | -| `task_update_time_seconds_count` | Gauge | `taskType`, `status` | Total number of task updates by status | -| `task_update_time_seconds_sum` | Gauge | `taskType`, `status` | Total time spent updating tasks | -| `task_update_error_total` | Counter | `taskType`, `exception` | Number of task update errors | -| `task_result_size` | Gauge | `taskType` | Size of task result payload (bytes) | -| `task_execution_queue_full_total` | Counter | `taskType` | Number of times execution queue was full | -| `task_paused_total` | Counter | `taskType` | Number of polls while worker paused | -| `worker_restart_total` | Counter | `taskType` | Number of times TaskHandler restarted a worker subprocess | -| `external_payload_used_total` | Counter | `taskType`, `payloadType` | External payload storage usage count | -| `workflow_input_size` | Gauge | `workflowType`, `version` | Workflow input payload size (bytes) | -| `workflow_start_error_total` | Counter | `workflowType`, `exception` | Workflow start error count | - -### Label Values - -**`status`**: `SUCCESS`, `FAILURE` -**`method`**: `GET`, `POST`, `PUT`, `DELETE` -**`uri`**: API endpoint path (e.g., `/tasks/poll/batch/{taskType}`, `/tasks/update-v2`) -**`status` (HTTP)**: HTTP response code (`200`, `401`, `404`, `500`) or `error` -**`quantile`**: `0.5` (p50), `0.75` (p75), `0.9` (p90), `0.95` (p95), `0.99` (p99) -**`payloadType`**: `input`, `output` -**`exception`**: Exception type or error message - -### Example Metrics Output +# Python SDK Metrics -```prometheus -# API Request Metrics -api_request_time_seconds{method="GET",uri="/tasks/poll/batch/myTask",status="200",quantile="0.5"} 0.112 -api_request_time_seconds{method="GET",uri="/tasks/poll/batch/myTask",status="200",quantile="0.99"} 0.245 -api_request_time_seconds_count{method="GET",uri="/tasks/poll/batch/myTask",status="200"} 1000.0 -api_request_time_seconds_sum{method="GET",uri="/tasks/poll/batch/myTask",status="200"} 114.5 - -# Task Poll Metrics -task_poll_total{taskType="myTask"} 10264.0 -task_poll_time_seconds{taskType="myTask",status="SUCCESS",quantile="0.95"} 0.025 -task_poll_time_seconds_count{taskType="myTask",status="SUCCESS"} 1000.0 -task_poll_time_seconds_count{taskType="myTask",status="FAILURE"} 95.0 - -# Task Execution Metrics -task_execute_time_seconds{taskType="myTask",status="SUCCESS",quantile="0.99"} 0.017 -task_execute_time_seconds_count{taskType="myTask",status="SUCCESS"} 120.0 -task_execute_error_total{taskType="myTask",exception="TimeoutError"} 3.0 - -# Task Update Metrics -task_update_time_seconds{taskType="myTask",status="SUCCESS",quantile="0.95"} 0.096 -task_update_time_seconds_count{taskType="myTask",status="SUCCESS"} 15.0 -``` +The Conductor Python SDK can expose Prometheus metrics for worker polling, task +execution, task updates, workflow starts, external payload usage, and generated +API-client HTTP calls. -## Configuration +The SDK currently has two mutually exclusive metric surfaces: -### Enabling Metrics +- **Legacy metrics** are the default. They preserve the pre-harmonization Python + SDK names and shapes, including sliding-window quantile gauges for timing + metrics. +- **Canonical metrics** are opt-in with `WORKER_CANONICAL_METRICS=true`. They + use the cross-SDK canonical names, labels, units, and Prometheus histogram + shapes. -Metrics are enabled by providing a `MetricsSettings` object when creating a `TaskHandler`: +Only one collector is active in a worker process. The SDK does not emit legacy +and canonical metrics at the same time. -```python -from conductor.client.configuration.configuration import Configuration -from conductor.client.configuration.settings.metrics_settings import MetricsSettings -from conductor.client.automator.task_handler import TaskHandler - -# Configure metrics -metrics_settings = MetricsSettings( - directory='/path/to/metrics', # Directory where metrics file will be written - file_name='conductor_metrics.prom', # Metrics file name (default: 'conductor_metrics.prom') - update_interval=10 # Update interval in seconds (default: 10) -) +Metric names below are the names exposed in Prometheus text output. The Python +`prometheus_client` library appends `_total` to counters in exposition output. -# Configure Conductor connection -api_config = Configuration( - server_api_url='http://localhost:8080/api', - debug=False -) - -# Create task handler with metrics -with TaskHandler( - configuration=api_config, - metrics_settings=metrics_settings, - workers=[...] -) as task_handler: - task_handler.start_processes() -``` - -### AsyncIO Workers +## Configuration -Usage with TaskHandler: +Enable metrics by passing `MetricsSettings` to `TaskHandler`. ```python from conductor.client.automator.task_handler import TaskHandler +from conductor.client.configuration.configuration import Configuration +from conductor.client.configuration.settings.metrics_settings import MetricsSettings + +config = Configuration() +metrics = MetricsSettings( + directory="/tmp/conductor-metrics", + http_port=8000, +) with TaskHandler( - configuration=api_config, - metrics_settings=metrics_settings, + configuration=config, + metrics_settings=metrics, scan_for_annotated_workers=True, - import_modules=['your_module'] ) as task_handler: task_handler.start_processes() task_handler.join_processes() ``` -### Metrics File Cleanup +With `http_port` set, the SDK starts a Prometheus-compatible HTTP endpoint: -For multiprocess workers using Prometheus multiprocess mode, clean the metrics directory on startup to avoid stale data: +```shell +curl http://localhost:8000/metrics +curl http://localhost:8000/health +``` + +Without `http_port`, the SDK writes Prometheus text output to +`{directory}/{file_name}` at `update_interval` seconds: ```python -import os -import shutil - -metrics_dir = '/path/to/metrics' -if os.path.exists(metrics_dir): - shutil.rmtree(metrics_dir) -os.makedirs(metrics_dir, exist_ok=True) - -metrics_settings = MetricsSettings( - directory=metrics_dir, - file_name='conductor_metrics.prom', - update_interval=10 +metrics = MetricsSettings( + directory="/tmp/conductor-metrics", + file_name="conductor_metrics.prom", + update_interval=10, + http_port=None, ) ``` +`MetricsSettings` cleans stale Prometheus multiprocess `.db` files by default. +Use a dedicated metrics directory per worker process group. -## Metric Types - -### Quantile Gauges (Timers) +## Selecting Canonical Metrics -All timing metrics use quantile gauges to track latency distribution: +Set `WORKER_CANONICAL_METRICS` before the worker starts: -- **Quantile labels**: Each metric includes 5 quantiles (p50, p75, p90, p95, p99) -- **Count suffix**: `{metric_name}_count` tracks total number of observations -- **Sum suffix**: `{metric_name}_sum` tracks total time spent - -**Example calculation (average):** -``` -average = task_poll_time_seconds_sum / task_poll_time_seconds_count -average = 18.75 / 1000.0 = 0.01875 seconds +```shell +WORKER_CANONICAL_METRICS=true python my_worker.py ``` -**Why quantiles instead of histograms?** -- More accurate percentile tracking with sliding window (last 1000 observations) -- No need to pre-configure bucket boundaries -- Lower memory footprint -- Direct percentile values without interpolation +Accepted true values are `true`, `1`, and `yes`, case-insensitive. Any other +value, or an unset variable, selects legacy metrics. The variable is read when +the metrics collector is created, so changing it requires a worker restart. + +## Canonical Metrics + +Canonical timing values are seconds. Canonical size values are bytes. Label +names use camelCase. + +Metrics are created lazily. A metric appears in `/metrics` only after the +corresponding worker event or collector method records it. Some low-level +surface metrics, such as ack, queue-full, paused, and uncaught-exception +counters, may not appear in normal worker runs unless that path is exercised. + +### Canonical Counters + +| Metric | Labels | Description | +|---|---|---| +| `task_poll_total` | `taskType` | Incremented each time the worker issues a poll request. | +| `task_execution_started_total` | `taskType` | Incremented when a polled task is dispatched to the worker function. | +| `task_poll_error_total` | `taskType`, `exception` | Incremented when a poll request fails client-side. | +| `task_execute_error_total` | `taskType`, `exception` | Incremented when the worker function throws. | +| `task_update_error_total` | `taskType`, `exception` | Incremented when updating the task result fails. | +| `task_ack_error_total` | `taskType`, `exception` | Collector surface for task ack errors. | +| `task_ack_failed_total` | `taskType` | Collector surface for failed task ack responses. | +| `task_execution_queue_full_total` | `taskType` | Collector surface for execution queue saturation events. | +| `task_paused_total` | `taskType` | Collector surface for polls skipped while the worker is paused. | +| `thread_uncaught_exceptions_total` | `exception` | Collector surface for uncaught worker-thread exceptions. | +| `worker_restart_total` | `taskType` | Python-only counter for TaskHandler subprocess restarts. | +| `external_payload_used_total` | `entityName`, `operation`, `payloadType` | Incremented when external payload storage is used. | +| `workflow_start_error_total` | `workflowType`, `exception` | Incremented when starting a workflow fails client-side. | + +### Canonical Time Histograms + +All canonical time histograms use buckets: +`0.001`, `0.005`, `0.01`, `0.025`, `0.05`, `0.1`, `0.25`, `0.5`, `1`, `2.5`, +`5`, `10`. + +| Metric | Labels | Description | +|---|---|---| +| `task_poll_time_seconds` | `taskType`, `status` | Poll request latency. `status` is `SUCCESS` or `FAILURE`. | +| `task_execute_time_seconds` | `taskType`, `status` | Worker function execution duration. `status` is `SUCCESS` or `FAILURE`. | +| `task_update_time_seconds` | `taskType`, `status` | Task-result update latency. `status` is `SUCCESS` or `FAILURE`. | +| `http_api_client_request_seconds` | `method`, `uri`, `status` | Generated API-client HTTP request latency. | + +Each histogram exposes Prometheus series such as: -### Sliding Window - -Quantile metrics use a sliding window of the last 1000 observations to calculate percentiles. This provides: -- Recent performance data (not cumulative) -- Accurate percentile estimation -- Bounded memory usage - -## Examples - -### Querying Metrics with PromQL - -**Average API request latency:** -```promql -rate(api_request_time_seconds_sum[5m]) / rate(api_request_time_seconds_count[5m]) +```prometheus +task_execute_time_seconds_bucket{taskType="my_task",status="SUCCESS",le="0.1"} 42.0 +task_execute_time_seconds_count{taskType="my_task",status="SUCCESS"} 50.0 +task_execute_time_seconds_sum{taskType="my_task",status="SUCCESS"} 2.3 ``` -**API error rate:** -```promql -sum(rate(api_request_time_seconds_count{status=~"4..|5.."}[5m])) -/ -sum(rate(api_request_time_seconds_count[5m])) -``` +### Canonical Size Histograms + +All canonical size histograms use buckets: +`100`, `1000`, `10000`, `100000`, `1000000`, `10000000`. + +| Metric | Labels | Description | +|---|---|---| +| `task_result_size_bytes` | `taskType` | Serialized task result output size. | +| `workflow_input_size_bytes` | `workflowType`, `version` | Serialized workflow input size. | + +### Canonical Gauges + +| Metric | Labels | Description | +|---|---|---| +| `active_workers` | `taskType` | Current number of workers actively executing a task. | + +## Legacy Metrics + +Legacy mode is the default so existing dashboards and alerts continue to work. +Timing metrics are sliding-window quantile gauges over the latest 1,000 +observations. Legacy timing metrics also expose `_count` and `_sum` gauge +series for the current sliding window. + +As in canonical mode, metrics are created lazily and rare or surface-only +counters appear only when the corresponding code path records them. + +### Legacy Counters + +| Metric | Labels | Description | +|---|---|---| +| `task_poll_total` | `taskType` | Incremented each time polling is done. | +| `task_execute_error_total` | `taskType`, `exception` | Task execution errors. `exception` is `str(exception)`. | +| `task_update_error_total` | `taskType`, `exception` | Task update errors. `exception` is `str(exception)`. | +| `task_ack_error_total` | `taskType`, `exception` | Collector surface for task ack errors. `exception` is `str(exception)`. | +| `task_ack_failed_total` | `taskType` | Collector surface for failed task ack responses. | +| `task_execution_queue_full_total` | `taskType` | Collector surface for execution queue saturation events. | +| `task_paused_total` | `taskType` | Collector surface for polls skipped while the worker is paused. | +| `thread_uncaught_exceptions_total` | none | Collector surface for uncaught worker-thread exceptions. | +| `worker_restart_total` | `taskType` | TaskHandler subprocess restarts. | +| `external_payload_used_total` | `entityName`, `operation`, `payload_type` | External payload storage usage. | +| `workflow_start_error_total` | `workflowType`, `exception` | Workflow start errors. `exception` is `str(exception)`. | + +Legacy mode does not emit `task_poll_error_total`, +`task_execution_started_total`, or `active_workers`. + +### Legacy Time Metrics + +| Metric | Type | Labels | Description | +|---|---|---|---| +| `task_poll_time` | Gauge | `taskType` | Most recent poll duration, in seconds. | +| `task_poll_time_seconds` | Quantile gauge | `taskType`, `status`, `quantile` | Sliding-window poll latency quantiles. | +| `task_poll_time_seconds_count` | Gauge | `taskType`, `status` | Sliding-window poll observation count. | +| `task_poll_time_seconds_sum` | Gauge | `taskType`, `status` | Sliding-window poll duration sum. | +| `task_execute_time` | Gauge | `taskType` | Most recent task execution duration, in seconds. | +| `task_execute_time_seconds` | Quantile gauge | `taskType`, `status`, `quantile` | Sliding-window execution latency quantiles. | +| `task_execute_time_seconds_count` | Gauge | `taskType`, `status` | Sliding-window execution observation count. | +| `task_execute_time_seconds_sum` | Gauge | `taskType`, `status` | Sliding-window execution duration sum. | +| `task_update_time_seconds` | Quantile gauge | `taskType`, `status`, `quantile` | Sliding-window task update latency quantiles. | +| `task_update_time_seconds_count` | Gauge | `taskType`, `status` | Sliding-window task update observation count. | +| `task_update_time_seconds_sum` | Gauge | `taskType`, `status` | Sliding-window task update duration sum. | +| `http_api_client_request` | Quantile gauge | `method`, `uri`, `status`, `quantile` | Sliding-window API-client request latency quantiles. | +| `http_api_client_request_count` | Gauge | `method`, `uri`, `status` | Sliding-window API-client request observation count. | +| `http_api_client_request_sum` | Gauge | `method`, `uri`, `status` | Sliding-window API-client request duration sum. | + +### Legacy Size Gauges + +| Metric | Labels | Description | +|---|---|---| +| `task_result_size` | `taskType` | Most recent serialized task result output size, in bytes. | +| `workflow_input_size` | `workflowType`, `version` | Most recent serialized workflow input size, in bytes. | + +## Labels + +| Label | Used by | Values | +|---|---|---| +| `taskType` | Worker metrics | Task definition name. | +| `workflowType` | Workflow metrics | Workflow definition name. | +| `version` | `workflow_input_size`, `workflow_input_size_bytes` | Workflow version as a string. If absent, the SDK uses `1`. | +| `status` | Task time metrics | `SUCCESS` or `FAILURE`. For HTTP metrics, the response code as a string, an exception `status` or `code`, or `error` when unavailable. | +| `exception` | Error counters | Legacy uses `str(exception)`. Canonical uses the exception class name, such as `TimeoutError`. | +| `entityName` | `external_payload_used_total` | Task type or workflow name associated with the external payload. | +| `operation` | `external_payload_used_total` | External payload operation, such as `READ` or `WRITE`. | +| `payload_type` | Legacy `external_payload_used_total` | Payload type, such as `TASK_INPUT`, `TASK_OUTPUT`, `WORKFLOW_INPUT`, or `WORKFLOW_OUTPUT`. | +| `payloadType` | Canonical `external_payload_used_total` | Payload type, such as `TASK_INPUT`, `TASK_OUTPUT`, `WORKFLOW_INPUT`, or `WORKFLOW_OUTPUT`. | +| `method` | HTTP metrics | HTTP verb. | +| `uri` | HTTP metrics | Request path passed by the generated API client. | +| `quantile` | Legacy time metrics | `0.5`, `0.75`, `0.9`, `0.95`, or `0.99`. | + +## Migrating From Legacy to Canonical + +Canonical mode is opt-in during the deprecation period. Before switching a +production worker, update dashboards and alerts against a staging worker with +`WORKER_CANONICAL_METRICS=true`. + +Key changes: + +- Time and size distribution metrics are real Prometheus histograms in + canonical mode. Query `_bucket` series with `histogram_quantile()` instead of + reading `{quantile="..."}` gauges. +- Legacy last-value gauges `task_poll_time`, `task_execute_time`, + `task_result_size`, and `workflow_input_size` are not emitted in canonical + mode. +- Canonical adds `task_poll_error_total`, `task_execution_started_total`, and + `active_workers`. +- `external_payload_used_total` changes label `payload_type` to `payloadType`. +- Canonical `exception` labels are bounded to exception class names. Legacy + error counters may include raw exception messages. + +Common replacements: + +| Legacy | Canonical | +|---|---| +| `task_poll_time_seconds{quantile="0.95"}` | `histogram_quantile(0.95, sum by (le, taskType, status) (rate(task_poll_time_seconds_bucket[5m])))` | +| `task_execute_time_seconds{quantile="0.95"}` | `histogram_quantile(0.95, sum by (le, taskType, status) (rate(task_execute_time_seconds_bucket[5m])))` | +| `task_update_time_seconds{quantile="0.95"}` | `histogram_quantile(0.95, sum by (le, taskType, status) (rate(task_update_time_seconds_bucket[5m])))` | +| `http_api_client_request{quantile="0.95"}` | `histogram_quantile(0.95, sum by (le, method, uri, status) (rate(http_api_client_request_seconds_bucket[5m])))` | +| `task_result_size` | `task_result_size_bytes_bucket`, `_count`, and `_sum` | +| `workflow_input_size` | `workflow_input_size_bytes_bucket`, `_count`, and `_sum` | +| `external_payload_used_total{payload_type="TASK_OUTPUT"}` | `external_payload_used_total{payloadType="TASK_OUTPUT"}` | + +Average latency queries continue to use `_sum` divided by `_count`, but the +canonical series are cumulative histogram counters: -**Task poll success rate:** ```promql -sum(rate(task_poll_time_seconds_count{status="SUCCESS"}[5m])) +sum(rate(task_execute_time_seconds_sum[5m])) by (taskType) / -sum(rate(task_poll_time_seconds_count[5m])) -``` - -**p95 task execution time:** -```promql -task_execute_time_seconds{quantile="0.95"} -``` - -**Slowest API endpoints (p99):** -```promql -topk(10, api_request_time_seconds{quantile="0.99"}) -``` - -### Complete Example - -```python -import os -import shutil -from conductor.client.configuration.configuration import Configuration -from conductor.client.configuration.settings.metrics_settings import MetricsSettings -from conductor.client.automator.task_handler import TaskHandler -from conductor.client.worker.worker_interface import WorkerInterface - -# Clean metrics directory -metrics_dir = os.path.join(os.path.expanduser('~'), 'conductor_metrics') -if os.path.exists(metrics_dir): - shutil.rmtree(metrics_dir) -os.makedirs(metrics_dir, exist_ok=True) - -# Configure metrics -metrics_settings = MetricsSettings( - directory=metrics_dir, - file_name='conductor_metrics.prom', - update_interval=10 # Update file every 10 seconds -) - -# Configure Conductor -api_config = Configuration( - server_api_url='http://localhost:8080/api', - debug=False -) - -# Define worker -class MyWorker(WorkerInterface): - def execute(self, task): - return {'status': 'completed'} - - def get_task_definition_name(self): - return 'my_task' - -# Start with metrics -with TaskHandler( - configuration=api_config, - metrics_settings=metrics_settings, - workers=[MyWorker()] -) as task_handler: - task_handler.start_processes() +sum(rate(task_execute_time_seconds_count[5m])) by (taskType) ``` -### Scraping with Prometheus - -Configure Prometheus to scrape the metrics file: - -```yaml -# prometheus.yml -scrape_configs: - - job_name: 'conductor-python-sdk' - static_configs: - - targets: ['localhost:8000'] # Use file_sd or custom exporter - metric_relabel_configs: - - source_labels: [taskType] - target_label: task_type -``` - -**Note:** Since metrics are written to a file, you'll need to either: -1. Use Prometheus's `textfile` collector with Node Exporter -2. Create a simple HTTP server to expose the metrics file -3. Use a custom exporter to read and serve the file - -### Example HTTP Metrics Server - -```python -from http.server import HTTPServer, SimpleHTTPRequestHandler -import os - -class MetricsHandler(SimpleHTTPRequestHandler): - def do_GET(self): - if self.path == '/metrics': - metrics_file = '/path/to/conductor_metrics.prom' - if os.path.exists(metrics_file): - with open(metrics_file, 'rb') as f: - content = f.read() - self.send_response(200) - self.send_header('Content-Type', 'text/plain; version=0.0.4') - self.end_headers() - self.wfile.write(content) - else: - self.send_response(404) - self.end_headers() - else: - self.send_response(404) - self.end_headers() - -# Run server -httpd = HTTPServer(('0.0.0.0', 8000), MetricsHandler) -httpd.serve_forever() -``` - -## Best Practices +## Troubleshooting -1. **Clean metrics directory on startup** to avoid stale multiprocess metrics -2. **Monitor disk space** as metrics files can grow with many task types -3. **Use appropriate update_interval** (10-60 seconds recommended) -4. **Set up alerts** on error rates and high latencies -5. **Monitor queue saturation** (`task_execution_queue_full_total`) for backpressure -6. **Track API errors** by status code to identify authentication or server issues -7. **Use p95/p99 latencies** for SLO monitoring rather than averages +### Metrics Are Empty -## Troubleshooting +- Verify that `metrics_settings` is passed to `TaskHandler`. +- Verify workers have polled or executed tasks. Metrics are created lazily when + the relevant event occurs. +- Check that the metrics directory is writable. -### Metrics file is empty -- Ensure `MetricsCollector` is registered as an event listener -- Check that workers are actually polling and executing tasks -- Verify the metrics directory has write permissions +### Stale or Unexpected Series -### Stale metrics after restart -- Clean the metrics directory on startup (see Configuration section) -- Prometheus's `multiprocess` mode requires cleanup between runs +- Use a dedicated `directory` for each worker process group. +- Leave `clean_directory=True` on `MetricsSettings` unless another process owns + the same Prometheus multiprocess directory. +- Restart workers after changing `WORKER_CANONICAL_METRICS`. -### High memory usage -- Reduce the sliding window size (default: 1000 observations) -- Increase `update_interval` to write less frequently -- Limit the number of unique label combinations +### High Cardinality -### Missing metrics -- Verify `metrics_settings` is passed to TaskHandler -- Check that the SDK version supports the metric you're looking for -- Ensure workers are properly registered and running +- Prefer canonical mode for bounded `exception` labels. +- Watch the `uri` label on HTTP metrics. The Python SDK records the request path + available at the generated API-client call site. +- Avoid embedding user identifiers or unbounded values in task type, workflow + type, or external payload labels. diff --git a/README.md b/README.md index 958dfbe35..0ea4e91ec 100644 --- a/README.md +++ b/README.md @@ -320,7 +320,7 @@ with TaskHandler(configuration=config, metrics_settings=metrics, scan_for_annota curl http://localhost:8000/metrics ``` -See [examples/metrics_example.py](examples/metrics_example.py) and [METRICS.md](METRICS.md) for details on all tracked metrics. +Legacy metrics are emitted by default. Set `WORKER_CANONICAL_METRICS=true` before starting workers to use the canonical metric catalog. See [examples/metrics_example.py](examples/metrics_example.py) and [METRICS.md](METRICS.md) for the full legacy and canonical reference. ### Managing Workflow Executions diff --git a/WORKER_CONFIGURATION.md b/WORKER_CONFIGURATION.md index 9d8ab1a01..39bb0bc28 100644 --- a/WORKER_CONFIGURATION.md +++ b/WORKER_CONFIGURATION.md @@ -330,7 +330,6 @@ export conductor.worker.process_order.paused=true When a worker is paused: - It stops polling for new tasks - Already-executing tasks complete normally -- The `task_paused_total` metric is incremented for each skipped poll - No code changes or process restarts required **Use cases:** @@ -346,11 +345,7 @@ unset conductor.worker.all.paused export conductor.worker.all.paused=false ``` -**Monitor paused workers** using the `task_paused_total` metric: -```promql -# Check how many times workers were paused -task_paused_total{taskType="process_order"} -``` +See [METRICS.md](METRICS.md) for the current Python SDK metrics catalog. ### Multi-Region Deployment diff --git a/docs/design/WORKER_DESIGN.md b/docs/design/WORKER_DESIGN.md index 98bcf62ba..fdc0b1af4 100644 --- a/docs/design/WORKER_DESIGN.md +++ b/docs/design/WORKER_DESIGN.md @@ -767,114 +767,13 @@ Workers package - all worker modules auto-discovered ## Metrics & Monitoring -The SDK provides comprehensive Prometheus metrics collection with two deployment modes: +This design document describes how worker events flow through the SDK. The +current user-facing metrics setup, legacy and canonical metric catalogs, +`WORKER_CANONICAL_METRICS` behavior, Prometheus examples, and migration guidance +are maintained in [`../../METRICS.md`](../../METRICS.md). -### Configuration - -**HTTP Mode (Recommended - Metrics served from memory):** -```python -from conductor.client.configuration.settings.metrics_settings import MetricsSettings - -metrics_settings = MetricsSettings( - directory="/tmp/conductor-metrics", # .db files for multiprocess coordination - update_interval=0.1, # Update every 100ms - http_port=8000 # Expose metrics via HTTP -) - -with TaskHandler( - configuration=config, - metrics_settings=metrics_settings -) as handler: - handler.start_processes() -``` - -**File Mode (Metrics written to file):** -```python -metrics_settings = MetricsSettings( - directory="/tmp/conductor-metrics", - file_name="metrics.prom", - update_interval=1.0, - http_port=None # No HTTP server - write to file instead -) -``` - -### Modes - -| Mode | HTTP Server | File Writes | Use Case | -|------|-------------|-------------|----------| -| HTTP (`http_port` set) | ✅ Built-in | ❌ Disabled | Prometheus scraping, production | -| File (`http_port=None`) | ❌ Disabled | ✅ Enabled | File-based monitoring, testing | - -**HTTP Mode Benefits:** -- Metrics served directly from memory (no file I/O) -- Built-in HTTP server with `/metrics` and `/health` endpoints -- Automatic aggregation across worker processes (no PID labels) -- Ready for Prometheus scraping out-of-the-box - -### Key Metrics - -**Task Metrics:** -- `task_poll_time_seconds{taskType,quantile}` - Poll latency (includes batch polling) -- `task_execute_time_seconds{taskType,quantile}` - Actual execution time (async tasks: from submission to completion) -- `task_execute_error_total{taskType,exception}` - Execution errors by type -- `task_poll_total{taskType}` - Total poll count -- `task_result_size_bytes{taskType,quantile}` - Task output size - -**API Metrics:** -- `http_api_client_request{method,uri,status,quantile}` - API request latency -- `http_api_client_request_count{method,uri,status}` - Request count by endpoint -- `http_api_client_request_sum{method,uri,status}` - Total request time - -**Labels:** -- `taskType`: Task definition name -- `method`: HTTP method (GET, POST, PUT) -- `uri`: API endpoint path -- `status`: HTTP status code -- `exception`: Exception type (for errors) -- `quantile`: 0.5, 0.75, 0.9, 0.95, 0.99 - -**Important Notes:** -- **No PID labels**: Metrics are automatically aggregated across processes -- **Async execution time**: Includes actual execution time, not just coroutine submission time -- **Multiprocess safe**: Uses SQLite .db files in `directory` for coordination - -### Prometheus Integration - -**Scrape Config:** -```yaml -scrape_configs: - - job_name: 'conductor-workers' - static_configs: - - targets: ['localhost:8000'] - scrape_interval: 15s -``` - -**Accessing Metrics:** -```bash -# Metrics endpoint -curl http://localhost:8000/metrics - -# Health check -curl http://localhost:8000/health - -# Watch specific metric -watch -n 1 'curl -s http://localhost:8000/metrics | grep task_execute_time_seconds' -``` - -**PromQL Examples:** -```promql -# Average execution time -rate(task_execute_time_seconds_sum[5m]) / rate(task_execute_time_seconds_count[5m]) - -# Success rate -sum(rate(task_execute_time_seconds_count{status="SUCCESS"}[5m])) / sum(rate(task_execute_time_seconds_count[5m])) - -# p95 latency -task_execute_time_seconds{quantile="0.95"} - -# Error rate -sum(rate(task_execute_error_total[5m])) by (taskType) -``` +Keep metric names and PromQL examples out of this design document so the SDK has +one source of truth for legacy and canonical metrics. --- @@ -1264,8 +1163,8 @@ class CostTracker(TaskRunnerEventsListener): ```python handler = TaskHandler( configuration=config, + metrics_settings=metrics_settings, event_listeners=[ - PrometheusMetricsCollector(), SLAMonitor(threshold_ms=5000), CostTracker(cost_per_second={'ml_task': 0.05}), CustomAuditLogger() diff --git a/docs/design/WORKER_SDK_IMPLEMENTATION_GUIDE.md b/docs/design/WORKER_SDK_IMPLEMENTATION_GUIDE.md index 3f28504df..8c2db407a 100644 --- a/docs/design/WORKER_SDK_IMPLEMENTATION_GUIDE.md +++ b/docs/design/WORKER_SDK_IMPLEMENTATION_GUIDE.md @@ -1789,44 +1789,15 @@ Response: void ## 15. Metrics & Monitoring -### 15.1 Required Metrics +The Python SDK's current metrics behavior is documented in +[`../../METRICS.md`](../../METRICS.md). That file is the source of truth for: -**Via Event System (Recommended):** +- Enabling metrics with `MetricsSettings` +- Selecting canonical metrics with `WORKER_CANONICAL_METRICS` +- The complete legacy and canonical Prometheus catalogs +- Migration guidance from legacy quantile gauges to canonical histograms -Implement MetricsCollector as EventListener: - -``` -class MetricsCollector implements TaskRunnerEventsListener { - on_poll_started(event): - increment_counter("task_poll_total", labels={taskType: event.taskType}) - - on_poll_completed(event): - record_histogram("task_poll_time_seconds", event.durationMs / 1000) - increment_counter("task_poll_total", labels={taskType: event.taskType}) - - on_task_execution_completed(event): - record_histogram("task_execute_time_seconds", event.durationMs / 1000) - record_histogram("task_result_size_bytes", event.outputSizeBytes) - - on_task_execution_failure(event): - increment_counter("task_execute_error_total", - labels={taskType: event.taskType, exception: event.cause.type}) - - on_task_update_failure(event): - increment_counter("task_update_failed_total", - labels={taskType: event.taskType}) - // CRITICAL: Alert operations team -} -``` - -**Metric Names (Prometheus format):** -- `task_poll_total{taskType}` -- `task_poll_time_seconds{taskType,quantile}` -- `task_execute_time_seconds{taskType,quantile}` -- `task_execute_error_total{taskType,exception}` -- `task_result_size_bytes{taskType,quantile}` -- `task_update_error_total{taskType,exception}` -- `task_update_failed_total{taskType}` ← CRITICAL metric +Do not duplicate metric names or PromQL examples in this implementation guide. --- diff --git a/docs/design/event_driven_interceptor_system.md b/docs/design/event_driven_interceptor_system.md index 011bdb85d..ce01b08b9 100644 --- a/docs/design/event_driven_interceptor_system.md +++ b/docs/design/event_driven_interceptor_system.md @@ -1,5 +1,11 @@ # Event-Driven Interceptor System - Design Document +> Historical design note: this document describes the event-driven interceptor +> design and includes some pre-harmonization metrics examples. The current +> Python SDK metrics setup, complete legacy and canonical catalogs, +> `WORKER_CANONICAL_METRICS` behavior, and migration guidance are maintained in +> [`../../METRICS.md`](../../METRICS.md). + ## Table of Contents - [Overview](#overview) - [Current State Analysis](#current-state-analysis) @@ -53,28 +59,10 @@ Design and implement an event-driven interceptor system that: ### Existing Metrics System -**Location**: `src/conductor/client/telemetry/metrics_collector.py` - -```python -class MetricsCollector: - def __init__(self, settings: MetricsSettings): - os.environ["PROMETHEUS_MULTIPROC_DIR"] = settings.directory - MultiProcessCollector(self.registry) - - def increment_task_poll(self, task_type: str) -> None: - self.__increment_counter( - name=MetricName.TASK_POLL, - documentation=MetricDocumentation.TASK_POLL, - labels={MetricLabel.TASK_TYPE: task_type} - ) -``` - -**Current Usage** in `task_runner_asyncio.py`: - -```python -if self.metrics_collector is not None: - self.metrics_collector.increment_task_poll(task_definition_name) -``` +The original design coupled task runner code directly to a metrics collector. +The current implementation now routes metrics through telemetry collector +classes and worker/client events. See [`../../METRICS.md`](../../METRICS.md) for +current setup and metric names. ### Problems with Current Approach @@ -86,18 +74,10 @@ if self.metrics_collector is not None: | Limited data | Can't access full context | Medium | | No filtering | All-or-nothing | Low | -### Available Metrics (Current) +### Current Metrics Reference -**Counters:** -- `task_poll`, `task_poll_error`, `task_execution_queue_full` -- `task_execute_error`, `task_ack_error`, `task_ack_failed` -- `task_update_error`, `task_paused` -- `thread_uncaught_exceptions`, `workflow_start_error` -- `external_payload_used` - -**Gauges:** -- `task_poll_time`, `task_execute_time` -- `task_result_size`, `workflow_input_size` +The current Prometheus metric names, labels, and legacy/canonical mode behavior +are maintained in [`../../METRICS.md`](../../METRICS.md). --- @@ -760,13 +740,12 @@ class TaskResultSize(TaskClientEvent): ## Metrics Collection Flow -### Old Flow (Current) +### Earlier Direct Flow ``` TaskRunner.poll_tasks() - └─> metrics_collector.increment_task_poll(task_type) - └─> counter.labels(task_type).inc() - └─> Prometheus registry + └─> direct metrics collector call + └─> Prometheus registry ``` **Problems:** @@ -774,14 +753,13 @@ TaskRunner.poll_tasks() - Synchronous call - Can't add custom logic without modifying SDK -### New Flow (Proposed) +### Event-Driven Flow ``` TaskRunner.poll_tasks() └─> event_dispatcher.publish(PollStarted(...)) └─> asyncio.create_task(dispatch_to_listeners()) - ├─> PrometheusCollector.on_poll_started() - │ └─> counter.labels(task_type).inc() + ├─> Metrics listener ├─> DatadogCollector.on_poll_started() │ └─> datadog.increment('poll.started') └─> CustomListener.on_poll_started() @@ -796,15 +774,7 @@ TaskRunner.poll_tasks() ### Integration with TaskRunnerAsyncIO -**Current code** (`task_runner_asyncio.py`): - -```python -# OLD - Direct metrics call -if self.metrics_collector is not None: - self.metrics_collector.increment_task_poll(task_definition_name) -``` - -**New code** (with events): +**Event publishing:** ```python # NEW - Event publishing @@ -815,211 +785,12 @@ self.event_dispatcher.publish(PollStarted( )) ``` -### Adapter Pattern for Backward Compatibility - -**Location**: `src/conductor/client/telemetry/metrics_collector_adapter.py` - -```python -""" -Adapter to make old MetricsCollector work with new event system. -""" - -from conductor.client.telemetry.metrics_collector import MetricsCollector as OldMetricsCollector -from conductor.client.events.listeners import MetricsCollector as NewMetricsCollector -from conductor.client.events.task_runner_events import * - - -class MetricsCollectorAdapter(NewMetricsCollector): - """ - Adapter that wraps old MetricsCollector and implements new protocol. - - This allows existing metrics collection to work with new event system - without any code changes. - """ - - def __init__(self, old_collector: OldMetricsCollector): - self.collector = old_collector +### Current Metrics Implementation - def on_poll_started(self, event: PollStarted) -> None: - self.collector.increment_task_poll(event.task_type) - - def on_poll_completed(self, event: PollCompleted) -> None: - self.collector.record_task_poll_time(event.task_type, event.duration_ms / 1000.0) - - def on_poll_failure(self, event: PollFailure) -> None: - # Create exception-like object for old API - error = type(event.error_type, (Exception,), {})() - self.collector.increment_task_poll_error(event.task_type, error) - - def on_task_execution_started(self, event: TaskExecutionStarted) -> None: - # Old collector doesn't have this metric - pass - - def on_task_execution_completed(self, event: TaskExecutionCompleted) -> None: - self.collector.record_task_execute_time( - event.task_type, - event.duration_ms / 1000.0 - ) - - def on_task_execution_failure(self, event: TaskExecutionFailure) -> None: - error = type(event.error_type, (Exception,), {})() - self.collector.increment_task_execution_error(event.task_type, error) - - # Implement other protocol methods... -``` - -### New Prometheus Collector (Reference Implementation) - -**Location**: `src/conductor/client/telemetry/prometheus/prometheus_metrics_collector.py` - -```python -""" -Reference implementation: Prometheus metrics collector using event system. -""" - -from typing import Optional -from prometheus_client import Counter, Histogram, CollectorRegistry -from conductor.client.events.listeners import MetricsCollector -from conductor.client.events.task_runner_events import * -from conductor.client.events.workflow_events import * -from conductor.client.events.task_client_events import * - - -class PrometheusMetricsCollector(MetricsCollector): - """ - Prometheus metrics collector implementing the MetricsCollector protocol. - - Exposes metrics in Prometheus format for scraping. - - Usage: - collector = PrometheusMetricsCollector() - - # Register with task handler - handler = TaskHandler( - configuration=config, - event_listeners=[collector] - ) - """ - - def __init__( - self, - registry: Optional[CollectorRegistry] = None, - namespace: str = "conductor" - ): - self.registry = registry or CollectorRegistry() - self.namespace = namespace - - # Define metrics - self._poll_started_counter = Counter( - f'{namespace}_task_poll_started_total', - 'Total number of task polling attempts', - ['task_type', 'worker_id'], - registry=self.registry - ) - - self._poll_duration_histogram = Histogram( - f'{namespace}_task_poll_duration_seconds', - 'Task polling duration in seconds', - ['task_type', 'status'], # status: success, failure - registry=self.registry - ) - - self._task_execution_started_counter = Counter( - f'{namespace}_task_execution_started_total', - 'Total number of task executions started', - ['task_type', 'worker_id'], - registry=self.registry - ) - - self._task_execution_duration_histogram = Histogram( - f'{namespace}_task_execution_duration_seconds', - 'Task execution duration in seconds', - ['task_type', 'status'], # status: completed, failed - registry=self.registry - ) - - self._task_execution_failure_counter = Counter( - f'{namespace}_task_execution_failures_total', - 'Total number of task execution failures', - ['task_type', 'error_type', 'retryable'], - registry=self.registry - ) - - self._workflow_started_counter = Counter( - f'{namespace}_workflow_started_total', - 'Total number of workflow start attempts', - ['workflow_name', 'status'], # status: success, failure - registry=self.registry - ) - - # Task Runner Event Handlers - - def on_poll_started(self, event: PollStarted) -> None: - self._poll_started_counter.labels( - task_type=event.task_type, - worker_id=event.worker_id - ).inc() - - def on_poll_completed(self, event: PollCompleted) -> None: - self._poll_duration_histogram.labels( - task_type=event.task_type, - status='success' - ).observe(event.duration_ms / 1000.0) - - def on_poll_failure(self, event: PollFailure) -> None: - self._poll_duration_histogram.labels( - task_type=event.task_type, - status='failure' - ).observe(event.duration_ms / 1000.0) - - def on_task_execution_started(self, event: TaskExecutionStarted) -> None: - self._task_execution_started_counter.labels( - task_type=event.task_type, - worker_id=event.worker_id - ).inc() - - def on_task_execution_completed(self, event: TaskExecutionCompleted) -> None: - self._task_execution_duration_histogram.labels( - task_type=event.task_type, - status='completed' - ).observe(event.duration_ms / 1000.0) - - def on_task_execution_failure(self, event: TaskExecutionFailure) -> None: - self._task_execution_duration_histogram.labels( - task_type=event.task_type, - status='failed' - ).observe(event.duration_ms / 1000.0) - - self._task_execution_failure_counter.labels( - task_type=event.task_type, - error_type=event.error_type, - retryable=str(event.is_retryable) - ).inc() - - # Workflow Event Handlers - - def on_workflow_started(self, event: WorkflowStarted) -> None: - self._workflow_started_counter.labels( - workflow_name=event.workflow_name, - status='success' if event.success else 'failure' - ).inc() - - def on_workflow_input_size(self, event: WorkflowInputSize) -> None: - # Could add histogram for input sizes - pass - - def on_workflow_payload_used(self, event: WorkflowPayloadUsed) -> None: - # Could track external storage usage - pass - - # Task Client Event Handlers - - def on_task_payload_used(self, event: TaskPayloadUsed) -> None: - pass - - def on_task_result_size(self, event: TaskResultSize) -> None: - pass -``` +The implemented SDK metrics collector is selected by `MetricsSettings` and, for +canonical mode, `WORKER_CANONICAL_METRICS`. It listens to worker and client +events through the current telemetry collector classes. See +[`../../METRICS.md`](../../METRICS.md) for current setup and metric names. --- @@ -1045,50 +816,45 @@ class PrometheusMetricsCollector(MetricsCollector): **Tasks:** 1. Add event_dispatcher to TaskRunnerAsyncIO 2. Add event_dispatcher to TaskRunner (multiprocessing) -3. Publish events alongside existing metrics calls -4. Create MetricsCollectorAdapter +3. Publish events for worker lifecycle changes +4. Register metrics collectors as event listeners 5. Integration tests -**Backward Compatible**: Both old and new APIs work simultaneously - -```python -# Both work at the same time -if self.metrics_collector: - self.metrics_collector.increment_task_poll(task_type) # OLD - -self.event_dispatcher.publish(PollStarted(...)) # NEW -``` +**Backward Compatible**: Existing metrics setup continues to work while event +publishing is introduced. ### Phase 3: Reference Implementation (Week 3) **Goal**: New Prometheus collector using events **Tasks:** -1. Implement PrometheusMetricsCollector (new) +1. Implement the built-in metrics collector 2. Create example collectors (Datadog, CloudWatch) 3. Documentation and examples 4. Performance benchmarks -**Backward Compatible**: Users can choose old or new collector +**Backward Compatible**: Users can select legacy or canonical metrics through +the documented metrics factory behavior. ### Phase 4: Deprecation (Future Release) -**Goal**: Mark old API as deprecated +**Goal**: Deprecate pre-harmonization metric shapes when canonical metrics +become the default. **Tasks:** -1. Add deprecation warnings to old MetricsCollector -2. Update all examples to use new API -3. Migration guide +1. Announce canonical metrics as the default +2. Update examples to use documented metrics setup +3. Maintain migration guidance in `METRICS.md` **Timeline**: 6 months deprecation period ### Phase 5: Removal (Future Major Version) -**Goal**: Remove old metrics API +**Goal**: Remove legacy metric shapes in a future major version. **Tasks:** -1. Remove old MetricsCollector implementation -2. Remove adapter +1. Remove legacy collector implementation +2. Keep `METRICS.md` aligned with the released surface 3. Update major version **Timeline**: Next major version (2.0.0) @@ -1132,9 +898,9 @@ self.event_dispatcher.publish(PollStarted(...)) # NEW - [ ] Publish events (same as AsyncIO) - [ ] Handle multiprocess event publishing -**Day 4: Adapter Pattern** -- [ ] Implement MetricsCollectorAdapter -- [ ] Tests for adapter +**Day 4: Compatibility** +- [ ] Verify existing metrics setup continues to work +- [ ] Tests for compatibility behavior **Day 5: Integration Tests** - [ ] End-to-end tests with events @@ -1143,8 +909,8 @@ self.event_dispatcher.publish(PollStarted(...)) # NEW ### Week 3: Reference Implementation & Examples -**Day 1-2: New Prometheus Collector** -- [ ] Implement PrometheusMetricsCollector using events +**Day 1-2: Built-in Metrics Collector** +- [ ] Implement metrics collection using events - [ ] HTTP server for metrics endpoint - [ ] Tests @@ -1163,51 +929,29 @@ self.event_dispatcher.publish(PollStarted(...)) # NEW ## Examples -### Example 1: Basic Usage (Prometheus) +### Example 1: Current Metrics Usage ```python from conductor.client.configuration.configuration import Configuration from conductor.client.automator.task_handler import TaskHandler -from conductor.client.telemetry.prometheus.prometheus_metrics_collector import ( - PrometheusMetricsCollector -) +from conductor.client.configuration.settings.metrics_settings import MetricsSettings config = Configuration() - -# Create Prometheus collector -prometheus = PrometheusMetricsCollector() +metrics_settings = MetricsSettings(directory="/tmp/conductor-metrics", http_port=8000) # Create task handler with metrics with TaskHandler( configuration=config, - event_listeners=[prometheus] # NEW API + metrics_settings=metrics_settings, ) as handler: handler.start_processes() handler.join_processes() ``` -### Example 2: Multiple Collectors - -```python -from conductor.client.telemetry.prometheus.prometheus_metrics_collector import ( - PrometheusMetricsCollector -) -from my_app.metrics.datadog_collector import DatadogCollector -from my_app.monitoring.sla_monitor import SLAMonitor - -# Create multiple collectors -prometheus = PrometheusMetricsCollector() -datadog = DatadogCollector(api_key=os.getenv('DATADOG_API_KEY')) -sla_monitor = SLAMonitor(thresholds={'critical_task': 30.0}) - -# Register all collectors -handler = TaskHandler( - configuration=config, - event_listeners=[prometheus, datadog, sla_monitor] -) -``` +For the current Prometheus metrics catalog and canonical mode selection, see +[`../../METRICS.md`](../../METRICS.md). -### Example 3: Custom Event Listener +### Example 2: Custom Event Listener ```python from conductor.client.events.listeners import TaskRunnerEventsListener @@ -1240,7 +984,7 @@ handler = TaskHandler( ) ``` -### Example 4: Selective Listening (Lambda) +### Example 3: Selective Listening (Lambda) ```python from conductor.client.events.event_dispatcher import EventDispatcher @@ -1259,7 +1003,7 @@ dispatcher.register_sync( ) ``` -### Example 5: Cost Tracking +### Example 4: Cost Tracking ```python from decimal import Decimal @@ -1295,27 +1039,6 @@ handler = TaskHandler( ) ``` -### Example 6: Backward Compatibility - -```python -from conductor.client.configuration.settings.metrics_settings import MetricsSettings -from conductor.client.telemetry.metrics_collector import MetricsCollector -from conductor.client.telemetry.metrics_collector_adapter import MetricsCollectorAdapter - -# OLD API (still works) -metrics_settings = MetricsSettings(directory="/tmp/metrics") -old_collector = MetricsCollector(metrics_settings) - -# Wrap old collector with adapter -adapter = MetricsCollectorAdapter(old_collector) - -# Use with new event system -handler = TaskHandler( - configuration=config, - event_listeners=[adapter] # OLD collector works with NEW system! -) -``` - --- ## Performance Considerations @@ -1482,16 +1205,13 @@ prometheus.start_http_server(port=9991, path='/metrics') ## Appendix A: API Comparison -### Old API (Current) +### Earlier Direct Metrics API -```python -# Direct coupling to metrics collector -if self.metrics_collector: - self.metrics_collector.increment_task_poll(task_type) - self.metrics_collector.record_task_poll_time(task_type, duration) -``` +The earlier design coupled task runner code directly to metric recording calls. +Current user-facing metrics setup is documented in +[`../../METRICS.md`](../../METRICS.md). -### New API (Proposed) +### Event-Driven API ```python # Event-driven, decoupled @@ -1520,11 +1240,8 @@ src/conductor/client/ │ └── task_client_events.py # Task client event types │ ├── telemetry/ -│ ├── metrics_collector.py # OLD (keep for compatibility) -│ ├── metrics_collector_adapter.py # Adapter for old → new -│ └── prometheus/ -│ ├── __init__.py -│ └── prometheus_metrics_collector.py # NEW reference implementation +│ ├── metrics_collector.py # Compatibility shim +│ └── metrics_collector_base.py # Shared collector infrastructure │ └── automator/ ├── task_handler_asyncio.py # Modified to publish events From a6e721d17c74bec348d32ecb30168088c0f1cd28 Mon Sep 17 00:00:00 2001 From: Chris Hagglund Date: Wed, 6 May 2026 11:53:13 -0600 Subject: [PATCH 07/24] additional update of docs re: metrics --- METRICS.md | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/METRICS.md b/METRICS.md index 28266498d..24bc39ac3 100644 --- a/METRICS.md +++ b/METRICS.md @@ -234,7 +234,18 @@ Key changes: - Canonical `exception` labels are bounded to exception class names. Legacy error counters may include raw exception messages. -Common replacements: +Legacy metrics that change name or type in canonical mode: + +| Legacy metric | Canonical metric | Change | +|---|---|---| +| `task_poll_time` (gauge) | — | Removed; use the histogram instead. | +| `task_execute_time` (gauge) | — | Removed; use the histogram instead. | +| `task_result_size` (gauge) | `task_result_size_bytes` (histogram) | Renamed; gauge becomes histogram with buckets. | +| `workflow_input_size` (gauge) | `workflow_input_size_bytes` (histogram) | Renamed; gauge becomes histogram with buckets. | +| `http_api_client_request` (quantile gauge) | `http_api_client_request_seconds` (histogram) | Renamed with `_seconds` suffix; quantile gauge becomes histogram. | +| `external_payload_used_total{payload_type=…}` | `external_payload_used_total{payloadType=…}` | Label renamed from `payload_type` to `payloadType`. | + +Common PromQL replacements: | Legacy | Canonical | |---|---| From 98255f87c84b3243cdcc9e7bf843c0feb2e8691e Mon Sep 17 00:00:00 2001 From: Chris Hagglund Date: Thu, 7 May 2026 11:14:57 -0600 Subject: [PATCH 08/24] add or update a changelog --- CHANGELOG.md | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) create mode 100644 CHANGELOG.md diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 000000000..3a7bb8067 --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,29 @@ +# Changelog + +All notable changes to this project will be documented in this file. + +The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), +and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). + +## [Unreleased] + +### Added + +- **Metrics harmonization** - canonical metric surface aligned with the cross-SDK catalog, opt-in via `WORKER_CANONICAL_METRICS=true` + - New `CanonicalMetricsCollector` emits the harmonized cross-SDK catalog using real Prometheus `Histogram`s for timing and size, replacing the legacy quantile-gauge timing shape. New canonical-only metrics: `task_poll_error_total`, `task_execution_started_total`, `task_result_size_bytes`, `workflow_input_size_bytes`, `http_api_client_request_seconds`, `active_workers`. Time buckets `0.001…10s`; size buckets `100…10_000_000` bytes. + - `metrics_factory.create_metrics_collector(settings)` selects `LegacyMetricsCollector` (default) or `CanonicalMetricsCollector` based on `WORKER_CANONICAL_METRICS` (truthy: `true`, `1`, `yes`, case-insensitive, whitespace-trimmed). `WORKER_LEGACY_METRICS` is documented but not yet read. + - New abstract `MetricsCollectorBase` consolidates Prometheus infrastructure (lazy `prometheus_client` imports, multiprocess `NoPidCollector` aggregation, HTTP server, exception-label cardinality bounding) and event handlers shared by both collectors. + - `(Async)TaskRunner` now records `task_update_time` (`status="SUCCESS"` / `"FAILURE"`) on every update path. + - `OrkesWorkflowClient.start_workflow*` records workflow input payload size and increments `workflow_start_error` on exception; `OrkesClients` / `OrkesBaseClient` accept an optional `metrics_collector`. + - `MetricsSettings(clean_directory=True)` removes leftover `*.db` files in the multiprocess directory at init. + - `CONDUCTOR_MP_START_METHOD` env var (`spawn` / `fork` / `forkserver`; default `fork` on POSIX, `spawn` on Windows) to control the worker pool's multiprocessing start method (motivated by a `prometheus_client` lock-fork deadlock). + - Harness manifest sets `WORKER_CANONICAL_METRICS=true`; `harness/main.py` logs which collector is active. + +### Changed + +- **Metrics harmonization** - defaults preserved; legacy metrics emit unchanged when `WORKER_CANONICAL_METRICS` is unset + - `MetricLabel.PAYLOAD_TYPE` value changed from `"payload_type"` to `"payloadType"` to align with canonical camelCase labels; `PAYLOAD_TYPE_LEGACY = "payload_type"` was added so the legacy collector keeps emitting the snake-case label on `external_payload_used_total`. + - `metrics_collector.py` is now a thin compatibility shim: `MetricsCollector = LegacyMetricsCollector`, so `from conductor.client.telemetry.metrics_collector import MetricsCollector` continues to work. + - Default behavior is unchanged: with no env var set, the legacy metric names, label conventions, and quantile-gauge timing shape from prior releases are preserved. + - Rewrote `METRICS.md` to document both surfaces, the env-var gate, full canonical and legacy catalogs, labels, a "Migrating From Legacy to Canonical" mapping (including the `payload_type` → `payloadType` label change and PromQL replacements), and troubleshooting. + - Updated `README.md`, `WORKER_CONFIGURATION.md`, and `docs/design/WORKER_DESIGN.md` to point at `METRICS.md`. From cf49ea8476b780ca4c9c3b20645a08c37a845c91 Mon Sep 17 00:00:00 2001 From: Chris Hagglund Date: Mon, 11 May 2026 12:24:47 -0600 Subject: [PATCH 09/24] wip fix payload type to not break upgraders that might refer to it --- CHANGELOG.md | 2 +- src/conductor/client/telemetry/canonical_metrics_collector.py | 2 +- src/conductor/client/telemetry/legacy_metrics_collector.py | 2 +- src/conductor/client/telemetry/model/metric_label.py | 4 ++-- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 3a7bb8067..2cedc2b89 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -22,7 +22,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Changed - **Metrics harmonization** - defaults preserved; legacy metrics emit unchanged when `WORKER_CANONICAL_METRICS` is unset - - `MetricLabel.PAYLOAD_TYPE` value changed from `"payload_type"` to `"payloadType"` to align with canonical camelCase labels; `PAYLOAD_TYPE_LEGACY = "payload_type"` was added so the legacy collector keeps emitting the snake-case label on `external_payload_used_total`. + - `MetricLabel.PAYLOAD_TYPE` retains its original value `"payload_type"`; a new `PAYLOAD_TYPE_CAMEL = "payloadType"` constant is used only by the canonical collector on `external_payload_used_total`. - `metrics_collector.py` is now a thin compatibility shim: `MetricsCollector = LegacyMetricsCollector`, so `from conductor.client.telemetry.metrics_collector import MetricsCollector` continues to work. - Default behavior is unchanged: with no env var set, the legacy metric names, label conventions, and quantile-gauge timing shape from prior releases are preserved. - Rewrote `METRICS.md` to document both surfaces, the env-var gate, full canonical and legacy catalogs, labels, a "Migrating From Legacy to Canonical" mapping (including the `payload_type` → `payloadType` label change and PromQL replacements), and troubleshooting. diff --git a/src/conductor/client/telemetry/canonical_metrics_collector.py b/src/conductor/client/telemetry/canonical_metrics_collector.py index 42562a8bb..255c866b8 100644 --- a/src/conductor/client/telemetry/canonical_metrics_collector.py +++ b/src/conductor/client/telemetry/canonical_metrics_collector.py @@ -134,7 +134,7 @@ def increment_external_payload_used(self, entity_name: str, operation: str, payl labels={ MetricLabel.ENTITY_NAME: entity_name, MetricLabel.OPERATION: operation, - MetricLabel.PAYLOAD_TYPE: payload_type, + MetricLabel.PAYLOAD_TYPE_CAMEL: payload_type, }, ) diff --git a/src/conductor/client/telemetry/legacy_metrics_collector.py b/src/conductor/client/telemetry/legacy_metrics_collector.py index 418e56da0..9fe58b581 100644 --- a/src/conductor/client/telemetry/legacy_metrics_collector.py +++ b/src/conductor/client/telemetry/legacy_metrics_collector.py @@ -118,7 +118,7 @@ def increment_external_payload_used(self, entity_name: str, operation: str, payl labels={ MetricLabel.ENTITY_NAME: entity_name, MetricLabel.OPERATION: operation, - MetricLabel.PAYLOAD_TYPE_LEGACY: payload_type, + MetricLabel.PAYLOAD_TYPE: payload_type, }, ) diff --git a/src/conductor/client/telemetry/model/metric_label.py b/src/conductor/client/telemetry/model/metric_label.py index 8d4a63f70..96767959d 100644 --- a/src/conductor/client/telemetry/model/metric_label.py +++ b/src/conductor/client/telemetry/model/metric_label.py @@ -6,8 +6,8 @@ class MetricLabel(str, Enum): EXCEPTION = "exception" METHOD = "method" OPERATION = "operation" - PAYLOAD_TYPE = "payloadType" - PAYLOAD_TYPE_LEGACY = "payload_type" + PAYLOAD_TYPE = "payload_type" + PAYLOAD_TYPE_CAMEL = "payloadType" STATUS = "status" TASK_TYPE = "taskType" URI = "uri" From d2c5fa09be0c9014bcb14027552ce5ef4057c4ec Mon Sep 17 00:00:00 2001 From: Chris Hagglund Date: Mon, 11 May 2026 12:45:15 -0600 Subject: [PATCH 10/24] wip attempt to address directory and db clobbering concern --- CHANGELOG.md | 3 +- METRICS.md | 8 ++-- harness/main.py | 6 ++- .../settings/metrics_settings.py | 45 +++++++++++++++---- .../client/telemetry/metrics_factory.py | 23 ++++++++-- 5 files changed, 68 insertions(+), 17 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 2cedc2b89..0640187b4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -15,7 +15,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - New abstract `MetricsCollectorBase` consolidates Prometheus infrastructure (lazy `prometheus_client` imports, multiprocess `NoPidCollector` aggregation, HTTP server, exception-label cardinality bounding) and event handlers shared by both collectors. - `(Async)TaskRunner` now records `task_update_time` (`status="SUCCESS"` / `"FAILURE"`) on every update path. - `OrkesWorkflowClient.start_workflow*` records workflow input payload size and increments `workflow_start_error` on exception; `OrkesClients` / `OrkesBaseClient` accept an optional `metrics_collector`. - - `MetricsSettings(clean_directory=True)` removes leftover `*.db` files in the multiprocess directory at init. + - `create_metrics_collector` partitions the metrics directory into a `legacy/` or `canonical/` subdirectory, so switching implementations never produces stale metric names from the previous type. + - `MetricsSettings` gains `clean_directory` (default `False`) to wipe all `.db` files and `clean_dead_pids` (default `False`) to remove only `.db` files from PIDs that no longer exist. Both are executed by the factory against the final partitioned subdirectory. - `CONDUCTOR_MP_START_METHOD` env var (`spawn` / `fork` / `forkserver`; default `fork` on POSIX, `spawn` on Windows) to control the worker pool's multiprocessing start method (motivated by a `prometheus_client` lock-fork deadlock). - Harness manifest sets `WORKER_CANONICAL_METRICS=true`; `harness/main.py` logs which collector is active. diff --git a/METRICS.md b/METRICS.md index 24bc39ac3..379185551 100644 --- a/METRICS.md +++ b/METRICS.md @@ -277,9 +277,11 @@ sum(rate(task_execute_time_seconds_count[5m])) by (taskType) ### Stale or Unexpected Series -- Use a dedicated `directory` for each worker process group. -- Leave `clean_directory=True` on `MetricsSettings` unless another process owns - the same Prometheus multiprocess directory. +- The factory partitions the metrics directory into `legacy/` or `canonical/` + subdirectories, so switching implementations never mixes stale metric names. +- Pass `clean_dead_pids=True` to `MetricsSettings` to remove `.db` files from + PIDs that no longer exist. Use `clean_directory=True` only when you are sure + no other live process shares the same directory. - Restart workers after changing `WORKER_CANONICAL_METRICS`. ### High Cardinality diff --git a/harness/main.py b/harness/main.py index f556fabed..448240b1f 100644 --- a/harness/main.py +++ b/harness/main.py @@ -70,7 +70,11 @@ def main() -> None: configuration = Configuration() metrics_port = env_int_or_default("HARNESS_METRICS_PORT", 9991) - metrics_settings = MetricsSettings(http_port=metrics_port) + metrics_settings = MetricsSettings( + http_port=metrics_port, + clean_directory=True, + clean_dead_pids=True, + ) metrics_collector = create_metrics_collector(metrics_settings) print(f"Prometheus metrics server started on port {metrics_port} ({metrics_collector.collector_name()} metrics)") diff --git a/src/conductor/client/configuration/settings/metrics_settings.py b/src/conductor/client/configuration/settings/metrics_settings.py index 4d4bb6eec..efe6c3798 100644 --- a/src/conductor/client/configuration/settings/metrics_settings.py +++ b/src/conductor/client/configuration/settings/metrics_settings.py @@ -25,12 +25,17 @@ def __init__( file_name: str = "metrics.log", update_interval: float = 0.1, http_port: Optional[int] = None, - clean_directory: bool = True): + clean_directory: bool = False, + clean_dead_pids: bool = False): """ Configure metrics collection settings. Args: - directory: Directory for storing multiprocess metrics .db files + directory: Base directory for storing multiprocess metrics .db files. + The factory (``create_metrics_collector``) appends a + collector-type subdirectory (``legacy/`` or ``canonical/``) + so that switching implementations never produces stale + metric names. file_name: Name of the metrics output file (only used when http_port is None) update_interval: How often to update metrics (in seconds) http_port: Optional HTTP port to expose metrics endpoint for Prometheus scraping. @@ -41,10 +46,13 @@ def __init__( If None: - Metrics will be written to file at {directory}/{file_name} - No HTTP server will be started - clean_directory: If True (default), remove stale prometheus_client - .db files from the directory on init. Without this, - metric names from previous configurations persist in the - multiprocess directory and appear in /metrics output. + clean_directory: If True, remove all prometheus_client .db files from + the metrics directory when the collector is created. Only + safe when no other live process shares the same directory. + Defaults to False. + clean_dead_pids: If True, remove .db files whose owning PID no + longer exists. Safer than ``clean_directory`` in shared + environments. Defaults to False. """ if directory is None: directory = get_default_temporary_folder() @@ -52,8 +60,8 @@ def __init__( self.file_name = file_name self.update_interval = update_interval self.http_port = http_port - if clean_directory: - self._clean_stale_db_files() + self.clean_directory = clean_directory + self.clean_dead_pids = clean_dead_pids def __set_dir(self, dir: str) -> None: if not os.path.isdir(dir): @@ -66,7 +74,7 @@ def __set_dir(self, dir: str) -> None: self.directory = dir def _clean_stale_db_files(self) -> None: - """Remove leftover prometheus_client multiprocess .db files.""" + """Remove all prometheus_client multiprocess .db files.""" import glob pattern = os.path.join(self.directory, "*.db") for path in glob.glob(pattern): @@ -74,3 +82,22 @@ def _clean_stale_db_files(self) -> None: os.remove(path) except Exception as e: logger.debug("Could not remove stale metrics db file %s: %s", path, e) + + def _clean_dead_pid_files(self) -> None: + """Remove .db files whose owning PID no longer exists.""" + import glob + import re + pattern = os.path.join(self.directory, "*.db") + for path in glob.glob(pattern): + match = re.search(r'_(\d+)\.db$', os.path.basename(path)) + if not match: + continue + pid = int(match.group(1)) + try: + os.kill(pid, 0) # Check if the process is still running (0 = simple probe, doesn't send a signal), if it isn't we'll get a ProcessLookupError which is an OSError + except OSError: + try: + os.remove(path) + logger.debug("Removed dead-pid metrics file %s (pid %d)", path, pid) + except Exception as e: + logger.debug("Could not remove dead-pid metrics db file %s: %s", path, e) diff --git a/src/conductor/client/telemetry/metrics_factory.py b/src/conductor/client/telemetry/metrics_factory.py index 3b3e4187c..737202257 100644 --- a/src/conductor/client/telemetry/metrics_factory.py +++ b/src/conductor/client/telemetry/metrics_factory.py @@ -32,14 +32,31 @@ def create_metrics_collector(settings: MetricsSettings) -> MetricsCollectorBase: """ Create the metrics collector selected by environment variables. + Appends a collector-type subdirectory (``legacy/`` or ``canonical/``) to + ``settings.directory`` so that switching implementations never produces + stale metric names from the previous type. Cleanup flags on *settings* + (``clean_directory``, ``clean_dead_pids``) are executed against the final + subdirectory before the collector is constructed. + Returns a fully-initialised collector (legacy or canonical) that satisfies the MetricsCollector Protocol and can be registered as an event listener. """ - if _env_bool("WORKER_CANONICAL_METRICS", default=False): + collector_type = "canonical" if _env_bool("WORKER_CANONICAL_METRICS", default=False) else "legacy" + + partitioned_dir = os.path.join(settings.directory, collector_type) + os.makedirs(partitioned_dir, exist_ok=True) + settings.directory = partitioned_dir + + if settings.clean_directory: + settings._clean_stale_db_files() + if settings.clean_dead_pids: + settings._clean_dead_pid_files() + + if collector_type == "canonical": from conductor.client.telemetry.canonical_metrics_collector import CanonicalMetricsCollector - logger.info("WORKER_CANONICAL_METRICS is true — using CanonicalMetricsCollector") + logger.info("WORKER_CANONICAL_METRICS is true — using CanonicalMetricsCollector (dir=%s)", partitioned_dir) return CanonicalMetricsCollector(settings) from conductor.client.telemetry.legacy_metrics_collector import LegacyMetricsCollector - logger.info("Using LegacyMetricsCollector (set WORKER_CANONICAL_METRICS=true for canonical metrics)") + logger.info("Using LegacyMetricsCollector (dir=%s; set WORKER_CANONICAL_METRICS=true for canonical)", partitioned_dir) return LegacyMetricsCollector(settings) From ce03f7143c26f2d4addb9fb85c80d8418eebad41 Mon Sep 17 00:00:00 2001 From: Chris Hagglund Date: Mon, 11 May 2026 12:52:01 -0600 Subject: [PATCH 11/24] wip, attempting to not impose new behaviors on upgraders --- CHANGELOG.md | 2 +- .../client/orkes/orkes_workflow_client.py | 41 ++++++------------- .../telemetry/canonical_metrics_collector.py | 18 ++++++++ .../telemetry/legacy_metrics_collector.py | 7 +--- .../telemetry/metrics_collector_base.py | 15 +++++++ 5 files changed, 47 insertions(+), 36 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 0640187b4..3d562fd38 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -14,7 +14,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - `metrics_factory.create_metrics_collector(settings)` selects `LegacyMetricsCollector` (default) or `CanonicalMetricsCollector` based on `WORKER_CANONICAL_METRICS` (truthy: `true`, `1`, `yes`, case-insensitive, whitespace-trimmed). `WORKER_LEGACY_METRICS` is documented but not yet read. - New abstract `MetricsCollectorBase` consolidates Prometheus infrastructure (lazy `prometheus_client` imports, multiprocess `NoPidCollector` aggregation, HTTP server, exception-label cardinality bounding) and event handlers shared by both collectors. - `(Async)TaskRunner` now records `task_update_time` (`status="SUCCESS"` / `"FAILURE"`) on every update path. - - `OrkesWorkflowClient.start_workflow*` records workflow input payload size and increments `workflow_start_error` on exception; `OrkesClients` / `OrkesBaseClient` accept an optional `metrics_collector`. + - `OrkesWorkflowClient.start_workflow*` calls `measure_workflow_input_payload_size` and `measure_workflow_start_error` on the collector; canonical records workflow input size and start errors, legacy no-ops (preserving existing behavior). `OrkesClients` / `OrkesBaseClient` accept an optional `metrics_collector`. - `create_metrics_collector` partitions the metrics directory into a `legacy/` or `canonical/` subdirectory, so switching implementations never produces stale metric names from the previous type. - `MetricsSettings` gains `clean_directory` (default `False`) to wipe all `.db` files and `clean_dead_pids` (default `False`) to remove only `.db` files from PIDs that no longer exist. Both are executed by the factory against the final partitioned subdirectory. - `CONDUCTOR_MP_START_METHOD` env var (`spawn` / `fork` / `forkserver`; default `fork` on POSIX, `spawn` on Windows) to control the worker pool's multiprocessing start method (motivated by a `prometheus_client` lock-fork deadlock). diff --git a/src/conductor/client/orkes/orkes_workflow_client.py b/src/conductor/client/orkes/orkes_workflow_client.py index 7ef501c07..931dd6282 100644 --- a/src/conductor/client/orkes/orkes_workflow_client.py +++ b/src/conductor/client/orkes/orkes_workflow_client.py @@ -1,5 +1,4 @@ from __future__ import annotations -import json from typing import Optional, List, Dict from conductor.client.configuration.configuration import Configuration @@ -40,45 +39,29 @@ def start_workflow_by_name( if priority: kwargs.update({"priority": priority}) - self._record_workflow_input_payload_size(name, version, input) + if self.metrics_collector is not None: + self.metrics_collector.measure_workflow_input_payload_size(name, version, input) try: return self.workflowResourceApi.start_workflow1(input, name, **kwargs) except Exception as e: - self._record_workflow_start_error(name, e) + if self.metrics_collector is not None: + self.metrics_collector.measure_workflow_start_error(name, e) raise def start_workflow(self, start_workflow_request: StartWorkflowRequest) -> str: - self._record_workflow_input_payload_size( - start_workflow_request.name, - start_workflow_request.version, - getattr(start_workflow_request, "input", None), - ) + if self.metrics_collector is not None: + self.metrics_collector.measure_workflow_input_payload_size( + start_workflow_request.name, + start_workflow_request.version, + getattr(start_workflow_request, "input", None), + ) try: return self.workflowResourceApi.start_workflow(start_workflow_request) except Exception as e: - self._record_workflow_start_error(start_workflow_request.name, e) + if self.metrics_collector is not None: + self.metrics_collector.measure_workflow_start_error(start_workflow_request.name, e) raise - def _record_workflow_input_payload_size(self, name: str, version, workflow_input) -> None: - if self.metrics_collector is None: - return - try: - encoded = json.dumps(workflow_input if workflow_input is not None else {}, default=str) - size_bytes = len(encoded.encode("utf-8")) - self.metrics_collector.record_workflow_input_payload_size( - name, str(version) if version is not None else "", size_bytes, - ) - except Exception: - pass - - def _record_workflow_start_error(self, name: str, exception: Exception) -> None: - if self.metrics_collector is None: - return - try: - self.metrics_collector.increment_workflow_start_error(name, exception) - except Exception: - pass - def execute_workflow( self, start_workflow_request: StartWorkflowRequest, diff --git a/src/conductor/client/telemetry/canonical_metrics_collector.py b/src/conductor/client/telemetry/canonical_metrics_collector.py index 255c866b8..d9f5217f9 100644 --- a/src/conductor/client/telemetry/canonical_metrics_collector.py +++ b/src/conductor/client/telemetry/canonical_metrics_collector.py @@ -228,3 +228,21 @@ def record_workflow_input_payload_size(self, workflow_type: str, version: str, p value=payload_size, buckets=SIZE_BUCKETS, ) + + # ------------------------------------------------------------------ + # Workflow-client hooks (canonical-only) + # ------------------------------------------------------------------ + + def measure_workflow_input_payload_size(self, name: str, version, workflow_input) -> None: + import json + try: + encoded = json.dumps(workflow_input if workflow_input is not None else {}, default=str) + size_bytes = len(encoded.encode("utf-8")) + self.record_workflow_input_payload_size( + name, str(version) if version is not None else "", size_bytes, + ) + except Exception: + pass + + def measure_workflow_start_error(self, name: str, exception: Exception) -> None: + self.increment_workflow_start_error(name, exception) diff --git a/src/conductor/client/telemetry/legacy_metrics_collector.py b/src/conductor/client/telemetry/legacy_metrics_collector.py index 9fe58b581..dde21a67b 100644 --- a/src/conductor/client/telemetry/legacy_metrics_collector.py +++ b/src/conductor/client/telemetry/legacy_metrics_collector.py @@ -172,12 +172,7 @@ def record_task_execute_time(self, task_type: str, time_spent: float, status: st ) def record_task_update_time(self, task_type: str, time_spent: float, status: str = "SUCCESS") -> None: - self._record_quantiles( - name=MetricName.TASK_UPDATE_TIME_HISTOGRAM, - documentation=MetricDocumentation.TASK_UPDATE_TIME_HISTOGRAM, - labels={MetricLabel.TASK_TYPE: task_type, MetricLabel.STATUS: status}, - value=time_spent, - ) + pass # canonical-only; call sites were added in this branch and didn't exist on main def record_api_request_time(self, method: str, uri: str, status: str, time_spent: float) -> None: self._record_quantiles( diff --git a/src/conductor/client/telemetry/metrics_collector_base.py b/src/conductor/client/telemetry/metrics_collector_base.py index 358012d8b..863a8618e 100644 --- a/src/conductor/client/telemetry/metrics_collector_base.py +++ b/src/conductor/client/telemetry/metrics_collector_base.py @@ -428,6 +428,21 @@ def record_workflow_input_payload_size(self, workflow_type: str, version: str, p @abc.abstractmethod def set_active_workers(self, task_type: str, count: int) -> None: ... + # ========================================================================= + # Workflow-client hooks. + # Called by OrkesWorkflowClient on start_workflow paths. No-ops by + # default so legacy mode adds zero overhead and emits no new metrics. + # Canonical overrides these to measure payload size and record errors. + # ========================================================================= + + def measure_workflow_input_payload_size(self, name: str, version, workflow_input) -> None: + """Measure and record the serialised size of *workflow_input*. No-op by default.""" + pass + + def measure_workflow_start_error(self, name: str, exception: Exception) -> None: + """Record a workflow-start error from the client call-site. No-op by default.""" + pass + # ========================================================================= # Concrete event handlers -- delegate to the abstract metric methods. # These satisfy the event listener protocols in event/listeners.py. From 9d118b1285141ba2063f315a2743f3a578bd6b88 Mon Sep 17 00:00:00 2001 From: Chris Hagglund Date: Tue, 12 May 2026 08:40:59 -0600 Subject: [PATCH 12/24] directory handling for metrics db location was broken by recent change --- .../client/automator/task_handler.py | 4 +-- .../settings/metrics_settings.py | 25 ++++++++++++++++--- .../telemetry/metrics_collector_base.py | 8 +++--- .../client/telemetry/metrics_factory.py | 19 +++++++------- 4 files changed, 37 insertions(+), 19 deletions(-) diff --git a/src/conductor/client/automator/task_handler.py b/src/conductor/client/automator/task_handler.py index 74dc62731..c8bb7b532 100644 --- a/src/conductor/client/automator/task_handler.py +++ b/src/conductor/client/automator/task_handler.py @@ -235,8 +235,8 @@ def __init__( # Set prometheus multiprocess directory BEFORE any worker processes start # This must be done before prometheus_client is imported in worker processes if metrics_settings is not None: - os.environ["PROMETHEUS_MULTIPROC_DIR"] = metrics_settings.directory - logger.info(f"Set PROMETHEUS_MULTIPROC_DIR={metrics_settings.directory}") + os.environ["PROMETHEUS_MULTIPROC_DIR"] = metrics_settings.metrics_directory + logger.info(f"Set PROMETHEUS_MULTIPROC_DIR={metrics_settings.metrics_directory}") # Store event listeners to pass to each worker process self.event_listeners = event_listeners or [] diff --git a/src/conductor/client/configuration/settings/metrics_settings.py b/src/conductor/client/configuration/settings/metrics_settings.py index efe6c3798..a4e263147 100644 --- a/src/conductor/client/configuration/settings/metrics_settings.py +++ b/src/conductor/client/configuration/settings/metrics_settings.py @@ -62,6 +62,25 @@ def __init__( self.http_port = http_port self.clean_directory = clean_directory self.clean_dead_pids = clean_dead_pids + self._collector_subdir: str = "" + self._metrics_directory: str = self.directory + + @property + def collector_subdir(self) -> str: + return self._collector_subdir + + @collector_subdir.setter + def collector_subdir(self, value: str) -> None: + self._collector_subdir = value + if value: + self._metrics_directory = os.path.join(self.directory, value) + else: + self._metrics_directory = self.directory + + @property + def metrics_directory(self) -> str: + """Full path where .db files live (base directory + collector subdir).""" + return self._metrics_directory def __set_dir(self, dir: str) -> None: if not os.path.isdir(dir): @@ -76,7 +95,7 @@ def __set_dir(self, dir: str) -> None: def _clean_stale_db_files(self) -> None: """Remove all prometheus_client multiprocess .db files.""" import glob - pattern = os.path.join(self.directory, "*.db") + pattern = os.path.join(self.metrics_directory, "*.db") for path in glob.glob(pattern): try: os.remove(path) @@ -87,14 +106,14 @@ def _clean_dead_pid_files(self) -> None: """Remove .db files whose owning PID no longer exists.""" import glob import re - pattern = os.path.join(self.directory, "*.db") + pattern = os.path.join(self.metrics_directory, "*.db") for path in glob.glob(pattern): match = re.search(r'_(\d+)\.db$', os.path.basename(path)) if not match: continue pid = int(match.group(1)) try: - os.kill(pid, 0) # Check if the process is still running (0 = simple probe, doesn't send a signal), if it isn't we'll get a ProcessLookupError which is an OSError + os.kill(pid, 0) except OSError: try: os.remove(path) diff --git a/src/conductor/client/telemetry/metrics_collector_base.py b/src/conductor/client/telemetry/metrics_collector_base.py index 863a8618e..39ab58c6a 100644 --- a/src/conductor/client/telemetry/metrics_collector_base.py +++ b/src/conductor/client/telemetry/metrics_collector_base.py @@ -112,7 +112,7 @@ def __init__(self, settings: MetricsSettings): if settings is None: return - os.environ["PROMETHEUS_MULTIPROC_DIR"] = settings.directory + os.environ["PROMETHEUS_MULTIPROC_DIR"] = settings.metrics_directory _ensure_prometheus_imported() @@ -121,7 +121,7 @@ def __init__(self, settings: MetricsSettings): self.must_collect_metrics = True logger.debug( "MetricsCollector initialized with directory=%s, must_collect=%s", - settings.directory, + settings.metrics_directory, self.must_collect_metrics, ) @@ -142,12 +142,12 @@ def provide_metrics(settings: MetricsSettings) -> None: except Exception: pass - os.environ["PROMETHEUS_MULTIPROC_DIR"] = settings.directory + os.environ["PROMETHEUS_MULTIPROC_DIR"] = settings.metrics_directory _ensure_prometheus_imported() OUTPUT_FILE_PATH = os.path.join( - settings.directory, + settings.metrics_directory, settings.file_name ) diff --git a/src/conductor/client/telemetry/metrics_factory.py b/src/conductor/client/telemetry/metrics_factory.py index 737202257..da2098f8e 100644 --- a/src/conductor/client/telemetry/metrics_factory.py +++ b/src/conductor/client/telemetry/metrics_factory.py @@ -32,20 +32,19 @@ def create_metrics_collector(settings: MetricsSettings) -> MetricsCollectorBase: """ Create the metrics collector selected by environment variables. - Appends a collector-type subdirectory (``legacy/`` or ``canonical/``) to - ``settings.directory`` so that switching implementations never produces - stale metric names from the previous type. Cleanup flags on *settings* - (``clean_directory``, ``clean_dead_pids``) are executed against the final - subdirectory before the collector is constructed. + Sets ``settings.collector_subdir`` to ``"legacy"`` or ``"canonical"`` so + that ``settings.metrics_directory`` resolves to a type-specific + subdirectory. This is idempotent: calling the factory more than once on + the same *settings* object (e.g. once in the main process and again in + each forked worker) always produces the same directory. Returns a fully-initialised collector (legacy or canonical) that satisfies the MetricsCollector Protocol and can be registered as an event listener. """ collector_type = "canonical" if _env_bool("WORKER_CANONICAL_METRICS", default=False) else "legacy" - partitioned_dir = os.path.join(settings.directory, collector_type) - os.makedirs(partitioned_dir, exist_ok=True) - settings.directory = partitioned_dir + settings.collector_subdir = collector_type + os.makedirs(settings.metrics_directory, exist_ok=True) if settings.clean_directory: settings._clean_stale_db_files() @@ -54,9 +53,9 @@ def create_metrics_collector(settings: MetricsSettings) -> MetricsCollectorBase: if collector_type == "canonical": from conductor.client.telemetry.canonical_metrics_collector import CanonicalMetricsCollector - logger.info("WORKER_CANONICAL_METRICS is true — using CanonicalMetricsCollector (dir=%s)", partitioned_dir) + logger.info("WORKER_CANONICAL_METRICS is true — using CanonicalMetricsCollector (dir=%s)", settings.metrics_directory) return CanonicalMetricsCollector(settings) from conductor.client.telemetry.legacy_metrics_collector import LegacyMetricsCollector - logger.info("Using LegacyMetricsCollector (dir=%s; set WORKER_CANONICAL_METRICS=true for canonical)", partitioned_dir) + logger.info("Using LegacyMetricsCollector (dir=%s; set WORKER_CANONICAL_METRICS=true for canonical)", settings.metrics_directory) return LegacyMetricsCollector(settings) From c65cf83f540cea4eeb4d1284eecdba1da11a037d Mon Sep 17 00:00:00 2001 From: Chris Hagglund Date: Tue, 12 May 2026 10:02:09 -0600 Subject: [PATCH 13/24] wip, dealing with cardinality in url labels of metrics --- src/conductor/client/http/api_client.py | 15 ++++++++++----- src/conductor/client/http/async_api_client.py | 14 ++++++++++---- .../telemetry/canonical_metrics_collector.py | 5 +++-- .../client/telemetry/legacy_metrics_collector.py | 3 ++- .../client/telemetry/metrics_collector_base.py | 3 ++- 5 files changed, 27 insertions(+), 13 deletions(-) diff --git a/src/conductor/client/http/api_client.py b/src/conductor/client/http/api_client.py index 761f78e28..54b208dab 100644 --- a/src/conductor/client/http/api_client.py +++ b/src/conductor/client/http/api_client.py @@ -26,7 +26,6 @@ ) ) - class ApiClient(object): PRIMITIVE_TYPES = (float, bool, bytes, six.text_type) + six.integer_types NATIVE_TYPES_MAPPING = { @@ -124,6 +123,9 @@ def __call_api_no_retry( header_params = dict(self.parameters_to_tuples(header_params, collection_formats)) + # Save the URI template before path param substitution for metrics + metric_uri = resource_path + # path parameters if path_params: path_params = self.sanitize_for_serialization(path_params) @@ -171,7 +173,8 @@ def __call_api_no_retry( method, url, query_params=query_params, headers=header_params, post_params=post_params, body=body, _preload_content=_preload_content, - _request_timeout=_request_timeout) + _request_timeout=_request_timeout, + metric_uri=metric_uri) return_data = response_data if _preload_content: @@ -386,7 +389,7 @@ def call_api(self, resource_path, method, def request(self, method, url, query_params=None, headers=None, post_params=None, body=None, _preload_content=True, - _request_timeout=None): + _request_timeout=None, metric_uri=None): """Makes the HTTP request using RESTClient.""" # Extract URI path from URL (remove query params and domain) try: @@ -468,7 +471,8 @@ def request(self, method, url, query_params=None, headers=None, method=method, uri=uri, status=status_code, - time_spent=elapsed_time + time_spent=elapsed_time, + metric_uri=metric_uri, ) return response @@ -489,7 +493,8 @@ def request(self, method, url, query_params=None, headers=None, method=method, uri=uri, status=status_code, - time_spent=elapsed_time + time_spent=elapsed_time, + metric_uri=metric_uri, ) # Re-raise the exception diff --git a/src/conductor/client/http/async_api_client.py b/src/conductor/client/http/async_api_client.py index 3606573a2..6b094d4bd 100644 --- a/src/conductor/client/http/async_api_client.py +++ b/src/conductor/client/http/async_api_client.py @@ -135,6 +135,9 @@ async def __call_api_no_retry( header_params = dict(self.parameters_to_tuples(header_params, collection_formats)) + # Save the URI template before path param substitution for metrics + metric_uri = resource_path + # path parameters if path_params: path_params = self.sanitize_for_serialization(path_params) @@ -182,7 +185,8 @@ async def __call_api_no_retry( method, url, query_params=query_params, headers=header_params, post_params=post_params, body=body, _preload_content=_preload_content, - _request_timeout=_request_timeout) + _request_timeout=_request_timeout, + metric_uri=metric_uri) return_data = response_data if _preload_content: @@ -376,7 +380,7 @@ async def call_api(self, resource_path, method, async def request(self, method, url, query_params=None, headers=None, post_params=None, body=None, _preload_content=True, - _request_timeout=None): + _request_timeout=None, metric_uri=None): """Makes the async HTTP request using AsyncRESTClient.""" # Extract URI path from URL (remove query params and domain) try: @@ -458,7 +462,8 @@ async def request(self, method, url, query_params=None, headers=None, method=method, uri=uri, status=status_code, - time_spent=elapsed_time + time_spent=elapsed_time, + metric_uri=metric_uri, ) return response @@ -479,7 +484,8 @@ async def request(self, method, url, query_params=None, headers=None, method=method, uri=uri, status=status_code, - time_spent=elapsed_time + time_spent=elapsed_time, + metric_uri=metric_uri, ) # Re-raise the exception diff --git a/src/conductor/client/telemetry/canonical_metrics_collector.py b/src/conductor/client/telemetry/canonical_metrics_collector.py index d9f5217f9..79f5a66d2 100644 --- a/src/conductor/client/telemetry/canonical_metrics_collector.py +++ b/src/conductor/client/telemetry/canonical_metrics_collector.py @@ -191,13 +191,14 @@ def record_task_update_time(self, task_type: str, time_spent: float, status: str buckets=TIME_BUCKETS, ) - def record_api_request_time(self, method: str, uri: str, status: str, time_spent: float) -> None: + def record_api_request_time(self, method: str, uri: str, status: str, time_spent: float, + metric_uri: str = None) -> None: self._observe_histogram( name=MetricName.API_REQUEST_TIME_CANONICAL, documentation=MetricDocumentation.API_REQUEST_TIME_CANONICAL, labels={ MetricLabel.METHOD: method, - MetricLabel.URI: uri, + MetricLabel.URI: metric_uri or uri, MetricLabel.STATUS: status, }, value=time_spent, diff --git a/src/conductor/client/telemetry/legacy_metrics_collector.py b/src/conductor/client/telemetry/legacy_metrics_collector.py index dde21a67b..1ce724058 100644 --- a/src/conductor/client/telemetry/legacy_metrics_collector.py +++ b/src/conductor/client/telemetry/legacy_metrics_collector.py @@ -174,7 +174,8 @@ def record_task_execute_time(self, task_type: str, time_spent: float, status: st def record_task_update_time(self, task_type: str, time_spent: float, status: str = "SUCCESS") -> None: pass # canonical-only; call sites were added in this branch and didn't exist on main - def record_api_request_time(self, method: str, uri: str, status: str, time_spent: float) -> None: + def record_api_request_time(self, method: str, uri: str, status: str, time_spent: float, + metric_uri: str = None) -> None: self._record_quantiles( name=MetricName.API_REQUEST_TIME, documentation=MetricDocumentation.API_REQUEST_TIME, diff --git a/src/conductor/client/telemetry/metrics_collector_base.py b/src/conductor/client/telemetry/metrics_collector_base.py index 39ab58c6a..2a24430ca 100644 --- a/src/conductor/client/telemetry/metrics_collector_base.py +++ b/src/conductor/client/telemetry/metrics_collector_base.py @@ -417,7 +417,8 @@ def record_task_execute_time(self, task_type: str, time_spent: float, status: st def record_task_update_time(self, task_type: str, time_spent: float, status: str = "SUCCESS") -> None: ... @abc.abstractmethod - def record_api_request_time(self, method: str, uri: str, status: str, time_spent: float) -> None: ... + def record_api_request_time(self, method: str, uri: str, status: str, time_spent: float, + metric_uri: str = None) -> None: ... @abc.abstractmethod def record_task_result_payload_size(self, task_type: str, payload_size: int) -> None: ... From b2978cbd41b2651bc929f81e19a3346de3d75381 Mon Sep 17 00:00:00 2001 From: Chris Hagglund Date: Tue, 12 May 2026 10:23:12 -0600 Subject: [PATCH 14/24] getting template strings output in canonical metrics uris - wip --- harness/main.py | 13 ++++- harness/workflow_governor.py | 7 ++- harness/workflow_status_probe.py | 81 ++++++++++++++++++++++++++++++++ 3 files changed, 99 insertions(+), 2 deletions(-) create mode 100644 harness/workflow_status_probe.py diff --git a/harness/main.py b/harness/main.py index 448240b1f..417ef7b6a 100644 --- a/harness/main.py +++ b/harness/main.py @@ -15,6 +15,7 @@ from simulated_task_worker import SimulatedTaskWorker from workflow_governor import WorkflowGovernor +from workflow_status_probe import WorkflowStatusProbe WORKFLOW_NAME = "python_simulated_tasks_workflow" @@ -99,7 +100,15 @@ def main() -> None: ) workflow_executor = clients.get_workflow_executor() - governor = WorkflowGovernor(workflow_executor, WORKFLOW_NAME, workflows_per_sec) + workflow_client = clients.get_workflow_client() + + probe_rate = env_int_or_default("HARNESS_PROBE_RATE_PER_SEC", 0) + probe = WorkflowStatusProbe(workflow_client, probe_rate) + + governor = WorkflowGovernor( + workflow_executor, WORKFLOW_NAME, workflows_per_sec, + id_sink=probe.offer, + ) main_pid = os.getpid() shutting_down = False @@ -110,6 +119,7 @@ def shutdown(signum, frame): return shutting_down = True print("Shutting down...") + probe.shutdown() governor.stop() task_handler.stop_processes() sys.exit(0) @@ -123,6 +133,7 @@ def shutdown(signum, frame): # already running at fork() time, a child can inherit that lock in a held # state and deadlock on its first metric write. task_handler.start_processes() + probe.start() governor.start() task_handler.join_processes() diff --git a/harness/workflow_governor.py b/harness/workflow_governor.py index 38e92f5cf..0c9b9311b 100644 --- a/harness/workflow_governor.py +++ b/harness/workflow_governor.py @@ -2,6 +2,7 @@ import threading import time +from typing import Callable, Optional from conductor.client.http.models import StartWorkflowRequest from conductor.client.workflow.executor.workflow_executor import WorkflowExecutor @@ -14,10 +15,12 @@ def __init__( workflow_executor: WorkflowExecutor, workflow_name: str, workflows_per_second: int, + id_sink: Optional[Callable[[str], None]] = None, ) -> None: self._workflow_executor = workflow_executor self._workflow_name = workflow_name self._workflows_per_second = workflows_per_second + self._id_sink = id_sink self._stop_event = threading.Event() self._thread: threading.Thread | None = None @@ -45,7 +48,9 @@ def _start_batch(self) -> None: try: for _ in range(self._workflows_per_second): request = StartWorkflowRequest(name=self._workflow_name, version=1) - self._workflow_executor.start_workflow(request) + workflow_id = self._workflow_executor.start_workflow(request) + if self._id_sink is not None and workflow_id: + self._id_sink(workflow_id) print(f"Governor: started {self._workflows_per_second} workflow(s)") except Exception as e: print(f"Governor: error starting workflows: {e}") diff --git a/harness/workflow_status_probe.py b/harness/workflow_status_probe.py new file mode 100644 index 000000000..8df918f55 --- /dev/null +++ b/harness/workflow_status_probe.py @@ -0,0 +1,81 @@ +from __future__ import annotations + +import collections +import random +import threading +from typing import Optional + +from conductor.client.workflow_client import WorkflowClient + + +class WorkflowStatusProbe: + """Probe that exercises UUID-bearing workflow lookup endpoints so + ``http_api_client_request_seconds`` picks up entries with + ``uri=/workflow/{workflowId}`` and ``uri=/workflow/{workflowId}/status``. + + Default off. Runs only when ``HARNESS_PROBE_RATE_PER_SEC`` > 0. + + Side-effect-free: only issues read calls (``get_workflow`` and + ``get_workflow_status``). + + Self-bounded: recently-started workflow IDs are kept in a fixed-size + FIFO so memory is constant regardless of harness uptime. + """ + + MAX_TRACKED_IDS = 256 + + def __init__(self, workflow_client: WorkflowClient, calls_per_second: int) -> None: + self._workflow_client = workflow_client + self._calls_per_second = calls_per_second + self._recent_ids: collections.deque[str] = collections.deque(maxlen=self.MAX_TRACKED_IDS) + self._lock = threading.Lock() + self._stop_event = threading.Event() + self._thread: Optional[threading.Thread] = None + + def offer(self, workflow_id: str) -> None: + """Capture a workflow ID for later probing. Thread-safe.""" + if not workflow_id: + return + with self._lock: + self._recent_ids.appendleft(workflow_id) + + def start(self) -> None: + if self._calls_per_second <= 0: + print("WorkflowStatusProbe disabled (HARNESS_PROBE_RATE_PER_SEC<=0)") + return + print( + f"WorkflowStatusProbe started: rate={self._calls_per_second}/sec, " + f"retainedIds<={self.MAX_TRACKED_IDS}" + ) + self._stop_event.clear() + self._thread = threading.Thread(target=self._run, name="WorkflowStatusProbe", daemon=True) + self._thread.start() + + def shutdown(self) -> None: + self._stop_event.set() + if self._thread is not None: + self._thread.join(timeout=5.0) + + def _run(self) -> None: + while not self._stop_event.is_set(): + self._tick() + self._stop_event.wait(timeout=1.0) + + def _tick(self) -> None: + with self._lock: + budget = min(self._calls_per_second, len(self._recent_ids)) + if budget == 0: + return + ids_to_probe = [self._recent_ids[i] for i in range(budget)] + # Rotate: move probed IDs to the back + for _ in range(budget): + self._recent_ids.rotate(-1) + + for wf_id in ids_to_probe: + try: + if random.random() < 0.5: + self._workflow_client.get_workflow(wf_id, include_tasks=False) + else: + self._workflow_client.get_workflow_status(wf_id) + except Exception: + pass From 39d59e0af3d892c2a9da40755b97882a7dab87d4 Mon Sep 17 00:00:00 2001 From: Chris Hagglund Date: Tue, 12 May 2026 10:50:45 -0600 Subject: [PATCH 15/24] fix startup issue with db cleanup and child processes --- .../client/configuration/settings/metrics_settings.py | 1 + src/conductor/client/telemetry/metrics_factory.py | 11 +++++++---- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/src/conductor/client/configuration/settings/metrics_settings.py b/src/conductor/client/configuration/settings/metrics_settings.py index a4e263147..cbe0e5735 100644 --- a/src/conductor/client/configuration/settings/metrics_settings.py +++ b/src/conductor/client/configuration/settings/metrics_settings.py @@ -62,6 +62,7 @@ def __init__( self.http_port = http_port self.clean_directory = clean_directory self.clean_dead_pids = clean_dead_pids + self._owner_pid: Optional[int] = None self._collector_subdir: str = "" self._metrics_directory: str = self.directory diff --git a/src/conductor/client/telemetry/metrics_factory.py b/src/conductor/client/telemetry/metrics_factory.py index da2098f8e..33b754dfa 100644 --- a/src/conductor/client/telemetry/metrics_factory.py +++ b/src/conductor/client/telemetry/metrics_factory.py @@ -46,10 +46,13 @@ def create_metrics_collector(settings: MetricsSettings) -> MetricsCollectorBase: settings.collector_subdir = collector_type os.makedirs(settings.metrics_directory, exist_ok=True) - if settings.clean_directory: - settings._clean_stale_db_files() - if settings.clean_dead_pids: - settings._clean_dead_pid_files() + is_owner = settings._owner_pid is None or os.getpid() == settings._owner_pid + if is_owner: + settings._owner_pid = os.getpid() + if settings.clean_directory: + settings._clean_stale_db_files() + if settings.clean_dead_pids: + settings._clean_dead_pid_files() if collector_type == "canonical": from conductor.client.telemetry.canonical_metrics_collector import CanonicalMetricsCollector From ca9264778413b7b101de44807736a1418b157993 Mon Sep 17 00:00:00 2001 From: Chris Hagglund Date: Tue, 12 May 2026 12:06:10 -0600 Subject: [PATCH 16/24] addressing hygeine issues --- CHANGELOG.md | 4 +- METRICS.md | 5 +- .../client/automator/task_handler.py | 7 ++- .../settings/metrics_settings.py | 36 ++++++-------- .../client/orkes/orkes_workflow_client.py | 28 +++++++---- .../telemetry/canonical_metrics_collector.py | 1 + .../telemetry/metrics_collector_base.py | 10 ++-- .../client/telemetry/metrics_factory.py | 47 +++++++++++++------ 8 files changed, 85 insertions(+), 53 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 3d562fd38..3471e6a72 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -15,8 +15,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - New abstract `MetricsCollectorBase` consolidates Prometheus infrastructure (lazy `prometheus_client` imports, multiprocess `NoPidCollector` aggregation, HTTP server, exception-label cardinality bounding) and event handlers shared by both collectors. - `(Async)TaskRunner` now records `task_update_time` (`status="SUCCESS"` / `"FAILURE"`) on every update path. - `OrkesWorkflowClient.start_workflow*` calls `measure_workflow_input_payload_size` and `measure_workflow_start_error` on the collector; canonical records workflow input size and start errors, legacy no-ops (preserving existing behavior). `OrkesClients` / `OrkesBaseClient` accept an optional `metrics_collector`. - - `create_metrics_collector` partitions the metrics directory into a `legacy/` or `canonical/` subdirectory, so switching implementations never produces stale metric names from the previous type. - - `MetricsSettings` gains `clean_directory` (default `False`) to wipe all `.db` files and `clean_dead_pids` (default `False`) to remove only `.db` files from PIDs that no longer exist. Both are executed by the factory against the final partitioned subdirectory. + - Legacy metrics use the base directory unchanged from prior releases; canonical metrics use a `canonical/` subdirectory so that switching implementations never produces stale metric names. `MetricsSettings.metrics_directory` is a computed property that derives the correct path from `WORKER_CANONICAL_METRICS` without mutating state, ensuring consistent paths across all processes (main, workers, MetricsProvider). + - `MetricsSettings` gains `clean_directory` (default `False`) to wipe all `.db` files and `clean_dead_pids` (default `False`) to remove only `.db` files from PIDs that no longer exist. Both are executed by the factory against the resolved metrics directory. - `CONDUCTOR_MP_START_METHOD` env var (`spawn` / `fork` / `forkserver`; default `fork` on POSIX, `spawn` on Windows) to control the worker pool's multiprocessing start method (motivated by a `prometheus_client` lock-fork deadlock). - Harness manifest sets `WORKER_CANONICAL_METRICS=true`; `harness/main.py` logs which collector is active. diff --git a/METRICS.md b/METRICS.md index 379185551..c0433c3b4 100644 --- a/METRICS.md +++ b/METRICS.md @@ -277,8 +277,9 @@ sum(rate(task_execute_time_seconds_count[5m])) by (taskType) ### Stale or Unexpected Series -- The factory partitions the metrics directory into `legacy/` or `canonical/` - subdirectories, so switching implementations never mixes stale metric names. +- Legacy metrics use the base directory unchanged from prior releases. + Canonical metrics use a `canonical/` subdirectory, so switching + implementations never mixes stale metric names. - Pass `clean_dead_pids=True` to `MetricsSettings` to remove `.db` files from PIDs that no longer exist. Use `clean_directory=True` only when you are sure no other live process shares the same directory. diff --git a/src/conductor/client/automator/task_handler.py b/src/conductor/client/automator/task_handler.py index c8bb7b532..8dcf5ba09 100644 --- a/src/conductor/client/automator/task_handler.py +++ b/src/conductor/client/automator/task_handler.py @@ -232,9 +232,12 @@ def __init__( self._configuration = configuration self._metrics_settings = metrics_settings - # Set prometheus multiprocess directory BEFORE any worker processes start - # This must be done before prometheus_client is imported in worker processes + # Resolve the metrics subdirectory and set PROMETHEUS_MULTIPROC_DIR + # BEFORE any worker processes start. resolve_metrics_type is + # idempotent so it's safe if the caller already called it. if metrics_settings is not None: + from conductor.client.telemetry.metrics_factory import resolve_metrics_type + resolve_metrics_type(metrics_settings) os.environ["PROMETHEUS_MULTIPROC_DIR"] = metrics_settings.metrics_directory logger.info(f"Set PROMETHEUS_MULTIPROC_DIR={metrics_settings.metrics_directory}") diff --git a/src/conductor/client/configuration/settings/metrics_settings.py b/src/conductor/client/configuration/settings/metrics_settings.py index cbe0e5735..dab240f1d 100644 --- a/src/conductor/client/configuration/settings/metrics_settings.py +++ b/src/conductor/client/configuration/settings/metrics_settings.py @@ -32,10 +32,10 @@ def __init__( Args: directory: Base directory for storing multiprocess metrics .db files. - The factory (``create_metrics_collector``) appends a - collector-type subdirectory (``legacy/`` or ``canonical/``) - so that switching implementations never produces stale - metric names. + Legacy metrics use this directory directly (unchanged from + prior releases). Canonical metrics use a ``canonical/`` + subdirectory so that switching implementations never + produces stale metric names. file_name: Name of the metrics output file (only used when http_port is None) update_interval: How often to update metrics (in seconds) http_port: Optional HTTP port to expose metrics endpoint for Prometheus scraping. @@ -62,26 +62,20 @@ def __init__( self.http_port = http_port self.clean_directory = clean_directory self.clean_dead_pids = clean_dead_pids - self._owner_pid: Optional[int] = None - self._collector_subdir: str = "" - self._metrics_directory: str = self.directory - - @property - def collector_subdir(self) -> str: - return self._collector_subdir - - @collector_subdir.setter - def collector_subdir(self, value: str) -> None: - self._collector_subdir = value - if value: - self._metrics_directory = os.path.join(self.directory, value) - else: - self._metrics_directory = self.directory + self._subdir: Optional[str] = None @property def metrics_directory(self) -> str: - """Full path where .db files live (base directory + collector subdir).""" - return self._metrics_directory + """Full path where .db files live (base directory + optional subdir). + + The subdirectory is set by ``metrics_factory.resolve_metrics_type`` + before any child processes are forked. Legacy leaves it empty (base + directory unchanged from prior releases); canonical sets it to + ``"canonical"`` to avoid stale metric-name collisions. + """ + if self._subdir: + return os.path.join(self.directory, self._subdir) + return self.directory def __set_dir(self, dir: str) -> None: if not os.path.isdir(dir): diff --git a/src/conductor/client/orkes/orkes_workflow_client.py b/src/conductor/client/orkes/orkes_workflow_client.py index 931dd6282..b75453cf1 100644 --- a/src/conductor/client/orkes/orkes_workflow_client.py +++ b/src/conductor/client/orkes/orkes_workflow_client.py @@ -40,26 +40,38 @@ def start_workflow_by_name( kwargs.update({"priority": priority}) if self.metrics_collector is not None: - self.metrics_collector.measure_workflow_input_payload_size(name, version, input) + try: + self.metrics_collector.measure_workflow_input_payload_size(name, version, input) + except Exception: + pass try: return self.workflowResourceApi.start_workflow1(input, name, **kwargs) except Exception as e: if self.metrics_collector is not None: - self.metrics_collector.measure_workflow_start_error(name, e) + try: + self.metrics_collector.measure_workflow_start_error(name, e) + except Exception: + pass raise def start_workflow(self, start_workflow_request: StartWorkflowRequest) -> str: if self.metrics_collector is not None: - self.metrics_collector.measure_workflow_input_payload_size( - start_workflow_request.name, - start_workflow_request.version, - getattr(start_workflow_request, "input", None), - ) + try: + self.metrics_collector.measure_workflow_input_payload_size( + start_workflow_request.name, + start_workflow_request.version, + getattr(start_workflow_request, "input", None), + ) + except Exception: + pass try: return self.workflowResourceApi.start_workflow(start_workflow_request) except Exception as e: if self.metrics_collector is not None: - self.metrics_collector.measure_workflow_start_error(start_workflow_request.name, e) + try: + self.metrics_collector.measure_workflow_start_error(start_workflow_request.name, e) + except Exception: + pass raise def execute_workflow( diff --git a/src/conductor/client/telemetry/canonical_metrics_collector.py b/src/conductor/client/telemetry/canonical_metrics_collector.py index 79f5a66d2..d5f18a147 100644 --- a/src/conductor/client/telemetry/canonical_metrics_collector.py +++ b/src/conductor/client/telemetry/canonical_metrics_collector.py @@ -158,6 +158,7 @@ def set_active_workers(self, task_type: str, count: int) -> None: documentation=MetricDocumentation.ACTIVE_WORKERS, labels={MetricLabel.TASK_TYPE: task_type}, value=count, + multiprocess_mode='livesum', ) # ------------------------------------------------------------------ diff --git a/src/conductor/client/telemetry/metrics_collector_base.py b/src/conductor/client/telemetry/metrics_collector_base.py index 2a24430ca..374a6123b 100644 --- a/src/conductor/client/telemetry/metrics_collector_base.py +++ b/src/conductor/client/telemetry/metrics_collector_base.py @@ -293,7 +293,8 @@ def _record_gauge( name: MetricName, documentation: MetricDocumentation, labels: Dict[MetricLabel, str], - value: Any + value: Any, + multiprocess_mode: str = 'all' ) -> None: if not self.must_collect_metrics: return @@ -301,7 +302,8 @@ def _record_gauge( gauge = self._get_gauge( name=name, documentation=documentation, - labelnames=[label.value for label in labels.keys()] + labelnames=[label.value for label in labels.keys()], + multiprocess_mode=multiprocess_mode ) gauge.labels(*labels.values()).set(value) @@ -334,14 +336,14 @@ def _get_counter(self, name, documentation, labelnames): ) return self.counters[name] - def _get_gauge(self, name, documentation, labelnames): + def _get_gauge(self, name, documentation, labelnames, multiprocess_mode='all'): if name not in self.gauges: self.gauges[name] = Gauge( name=name, documentation=documentation, labelnames=labelnames, registry=self.registry, - multiprocess_mode='all' + multiprocess_mode=multiprocess_mode ) return self.gauges[name] diff --git a/src/conductor/client/telemetry/metrics_factory.py b/src/conductor/client/telemetry/metrics_factory.py index 33b754dfa..ebe7a58c5 100644 --- a/src/conductor/client/telemetry/metrics_factory.py +++ b/src/conductor/client/telemetry/metrics_factory.py @@ -21,6 +21,9 @@ ) +_CANONICAL_SUBDIR = "canonical" + + def _env_bool(name: str, default: bool) -> bool: value = os.environ.get(name, "") if not value: @@ -28,37 +31,53 @@ def _env_bool(name: str, default: bool) -> bool: return value.strip().lower() in ("true", "1", "yes") +_cleaned_directories: set = set() + + +def resolve_metrics_type(settings: MetricsSettings) -> None: + """Read ``WORKER_CANONICAL_METRICS`` once and store the result on *settings*. + + Idempotent -- subsequent calls are no-ops. Must be called before + ``settings.metrics_directory`` is read so that the subdirectory is + resolved in the main process before any child processes are forked. + + Both ``TaskHandler.__init__`` and ``create_metrics_collector`` call this. + """ + if settings._subdir is not None: + return + if _env_bool("WORKER_CANONICAL_METRICS", default=False): + settings._subdir = _CANONICAL_SUBDIR + else: + settings._subdir = "" + + def create_metrics_collector(settings: MetricsSettings) -> MetricsCollectorBase: """ Create the metrics collector selected by environment variables. - Sets ``settings.collector_subdir`` to ``"legacy"`` or ``"canonical"`` so - that ``settings.metrics_directory`` resolves to a type-specific - subdirectory. This is idempotent: calling the factory more than once on - the same *settings* object (e.g. once in the main process and again in - each forked worker) always produces the same directory. + Calls ``resolve_metrics_type`` to ensure ``settings.metrics_directory`` + returns the correct path, then instantiates the appropriate collector. Returns a fully-initialised collector (legacy or canonical) that satisfies the MetricsCollector Protocol and can be registered as an event listener. """ - collector_type = "canonical" if _env_bool("WORKER_CANONICAL_METRICS", default=False) else "legacy" + resolve_metrics_type(settings) - settings.collector_subdir = collector_type - os.makedirs(settings.metrics_directory, exist_ok=True) + metrics_dir = settings.metrics_directory + os.makedirs(metrics_dir, exist_ok=True) - is_owner = settings._owner_pid is None or os.getpid() == settings._owner_pid - if is_owner: - settings._owner_pid = os.getpid() + if metrics_dir not in _cleaned_directories: + _cleaned_directories.add(metrics_dir) if settings.clean_directory: settings._clean_stale_db_files() if settings.clean_dead_pids: settings._clean_dead_pid_files() - if collector_type == "canonical": + if settings._subdir == _CANONICAL_SUBDIR: from conductor.client.telemetry.canonical_metrics_collector import CanonicalMetricsCollector - logger.info("WORKER_CANONICAL_METRICS is true — using CanonicalMetricsCollector (dir=%s)", settings.metrics_directory) + logger.info("WORKER_CANONICAL_METRICS is true — using CanonicalMetricsCollector (dir=%s)", metrics_dir) return CanonicalMetricsCollector(settings) from conductor.client.telemetry.legacy_metrics_collector import LegacyMetricsCollector - logger.info("Using LegacyMetricsCollector (dir=%s; set WORKER_CANONICAL_METRICS=true for canonical)", settings.metrics_directory) + logger.info("Using LegacyMetricsCollector (dir=%s; set WORKER_CANONICAL_METRICS=true for canonical)", metrics_dir) return LegacyMetricsCollector(settings) From 99c9457f23e00b993764a4313f469d2e81a48693 Mon Sep 17 00:00:00 2001 From: Chris Hagglund Date: Tue, 12 May 2026 12:53:10 -0600 Subject: [PATCH 17/24] mostly doc fixes, minor tweak to sensible default --- CHANGELOG.md | 21 +--- METRICS.md | 113 +++++++++++++++--- .../telemetry/metrics_collector_base.py | 2 +- .../client/telemetry/metrics_factory.py | 7 +- 4 files changed, 106 insertions(+), 37 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 3471e6a72..b37cdf97d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,22 +9,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added -- **Metrics harmonization** - canonical metric surface aligned with the cross-SDK catalog, opt-in via `WORKER_CANONICAL_METRICS=true` - - New `CanonicalMetricsCollector` emits the harmonized cross-SDK catalog using real Prometheus `Histogram`s for timing and size, replacing the legacy quantile-gauge timing shape. New canonical-only metrics: `task_poll_error_total`, `task_execution_started_total`, `task_result_size_bytes`, `workflow_input_size_bytes`, `http_api_client_request_seconds`, `active_workers`. Time buckets `0.001…10s`; size buckets `100…10_000_000` bytes. - - `metrics_factory.create_metrics_collector(settings)` selects `LegacyMetricsCollector` (default) or `CanonicalMetricsCollector` based on `WORKER_CANONICAL_METRICS` (truthy: `true`, `1`, `yes`, case-insensitive, whitespace-trimmed). `WORKER_LEGACY_METRICS` is documented but not yet read. - - New abstract `MetricsCollectorBase` consolidates Prometheus infrastructure (lazy `prometheus_client` imports, multiprocess `NoPidCollector` aggregation, HTTP server, exception-label cardinality bounding) and event handlers shared by both collectors. - - `(Async)TaskRunner` now records `task_update_time` (`status="SUCCESS"` / `"FAILURE"`) on every update path. - - `OrkesWorkflowClient.start_workflow*` calls `measure_workflow_input_payload_size` and `measure_workflow_start_error` on the collector; canonical records workflow input size and start errors, legacy no-ops (preserving existing behavior). `OrkesClients` / `OrkesBaseClient` accept an optional `metrics_collector`. - - Legacy metrics use the base directory unchanged from prior releases; canonical metrics use a `canonical/` subdirectory so that switching implementations never produces stale metric names. `MetricsSettings.metrics_directory` is a computed property that derives the correct path from `WORKER_CANONICAL_METRICS` without mutating state, ensuring consistent paths across all processes (main, workers, MetricsProvider). - - `MetricsSettings` gains `clean_directory` (default `False`) to wipe all `.db` files and `clean_dead_pids` (default `False`) to remove only `.db` files from PIDs that no longer exist. Both are executed by the factory against the resolved metrics directory. - - `CONDUCTOR_MP_START_METHOD` env var (`spawn` / `fork` / `forkserver`; default `fork` on POSIX, `spawn` on Windows) to control the worker pool's multiprocessing start method (motivated by a `prometheus_client` lock-fork deadlock). - - Harness manifest sets `WORKER_CANONICAL_METRICS=true`; `harness/main.py` logs which collector is active. +- Canonical metrics mode: opt-in harmonized metric surface via `WORKER_CANONICAL_METRICS=true` -- [details](METRICS.md#detailed-technical-notes--unreleased) +- `MetricsSettings` gains `clean_directory` and `clean_dead_pids` for opt-in stale `.db` file cleanup (both default to `False`) +- `CONDUCTOR_MP_START_METHOD` env var to control the worker pool's multiprocessing start method ### Changed -- **Metrics harmonization** - defaults preserved; legacy metrics emit unchanged when `WORKER_CANONICAL_METRICS` is unset - - `MetricLabel.PAYLOAD_TYPE` retains its original value `"payload_type"`; a new `PAYLOAD_TYPE_CAMEL = "payloadType"` constant is used only by the canonical collector on `external_payload_used_total`. - - `metrics_collector.py` is now a thin compatibility shim: `MetricsCollector = LegacyMetricsCollector`, so `from conductor.client.telemetry.metrics_collector import MetricsCollector` continues to work. - - Default behavior is unchanged: with no env var set, the legacy metric names, label conventions, and quantile-gauge timing shape from prior releases are preserved. - - Rewrote `METRICS.md` to document both surfaces, the env-var gate, full canonical and legacy catalogs, labels, a "Migrating From Legacy to Canonical" mapping (including the `payload_type` → `payloadType` label change and PromQL replacements), and troubleshooting. - - Updated `README.md`, `WORKER_CONFIGURATION.md`, and `docs/design/WORKER_DESIGN.md` to point at `METRICS.md`. +- Legacy metrics emit unchanged by default; no env var required +- `metrics_collector.py` is now a compatibility shim; `from conductor.client.telemetry.metrics_collector import MetricsCollector` continues to work diff --git a/METRICS.md b/METRICS.md index c0433c3b4..9e7f97706 100644 --- a/METRICS.md +++ b/METRICS.md @@ -62,8 +62,11 @@ metrics = MetricsSettings( ) ``` -`MetricsSettings` cleans stale Prometheus multiprocess `.db` files by default. -Use a dedicated metrics directory per worker process group. +`MetricsSettings` provides opt-in cleanup of stale Prometheus multiprocess +`.db` files. Pass `clean_directory=True` to remove all `.db` files on startup, +or `clean_dead_pids=True` to remove only files from PIDs that no longer exist. +Both default to `False`. Use a dedicated metrics directory per worker process +group. ## Selecting Canonical Metrics @@ -77,6 +80,11 @@ Accepted true values are `true`, `1`, and `yes`, case-insensitive. Any other value, or an unset variable, selects legacy metrics. The variable is read when the metrics collector is created, so changing it requires a worker restart. +`WORKER_LEGACY_METRICS` is reserved for future use. After the deprecation +period ends and canonical becomes the default, setting +`WORKER_LEGACY_METRICS=true` will allow opting back into legacy metrics. It is +not currently read. + ## Canonical Metrics Canonical timing values are seconds. Canonical size values are bytes. Label @@ -96,11 +104,11 @@ counters, may not appear in normal worker runs unless that path is exercised. | `task_poll_error_total` | `taskType`, `exception` | Incremented when a poll request fails client-side. | | `task_execute_error_total` | `taskType`, `exception` | Incremented when the worker function throws. | | `task_update_error_total` | `taskType`, `exception` | Incremented when updating the task result fails. | -| `task_ack_error_total` | `taskType`, `exception` | Collector surface for task ack errors. | -| `task_ack_failed_total` | `taskType` | Collector surface for failed task ack responses. | -| `task_execution_queue_full_total` | `taskType` | Collector surface for execution queue saturation events. | -| `task_paused_total` | `taskType` | Collector surface for polls skipped while the worker is paused. | -| `thread_uncaught_exceptions_total` | `exception` | Collector surface for uncaught worker-thread exceptions. | +| `task_ack_error_total` | `taskType`, `exception` | Collector surface for task ack errors. Not yet wired -- no runtime call site. | +| `task_ack_failed_total` | `taskType` | Collector surface for failed task ack responses. Not yet wired -- no runtime call site. | +| `task_execution_queue_full_total` | `taskType` | Collector surface for execution queue saturation events. Not yet wired -- no runtime call site. | +| `task_paused_total` | `taskType` | Collector surface for polls skipped while the worker is paused. Not yet wired -- no runtime call site. | +| `thread_uncaught_exceptions_total` | `exception` | Collector surface for uncaught worker-thread exceptions. Not yet wired -- no runtime call site. | | `worker_restart_total` | `taskType` | Python-only counter for TaskHandler subprocess restarts. | | `external_payload_used_total` | `entityName`, `operation`, `payloadType` | Incremented when external payload storage is used. | | `workflow_start_error_total` | `workflowType`, `exception` | Incremented when starting a workflow fails client-side. | @@ -159,11 +167,11 @@ counters appear only when the corresponding code path records them. | `task_poll_total` | `taskType` | Incremented each time polling is done. | | `task_execute_error_total` | `taskType`, `exception` | Task execution errors. `exception` is `str(exception)`. | | `task_update_error_total` | `taskType`, `exception` | Task update errors. `exception` is `str(exception)`. | -| `task_ack_error_total` | `taskType`, `exception` | Collector surface for task ack errors. `exception` is `str(exception)`. | -| `task_ack_failed_total` | `taskType` | Collector surface for failed task ack responses. | -| `task_execution_queue_full_total` | `taskType` | Collector surface for execution queue saturation events. | -| `task_paused_total` | `taskType` | Collector surface for polls skipped while the worker is paused. | -| `thread_uncaught_exceptions_total` | none | Collector surface for uncaught worker-thread exceptions. | +| `task_ack_error_total` | `taskType`, `exception` | Collector surface for task ack errors. Not yet wired -- no runtime call site. `exception` is `str(exception)`. | +| `task_ack_failed_total` | `taskType` | Collector surface for failed task ack responses. Not yet wired -- no runtime call site. | +| `task_execution_queue_full_total` | `taskType` | Collector surface for execution queue saturation events. Not yet wired -- no runtime call site. | +| `task_paused_total` | `taskType` | Collector surface for polls skipped while the worker is paused. Not yet wired -- no runtime call site. | +| `thread_uncaught_exceptions_total` | none | Collector surface for uncaught worker-thread exceptions. Not yet wired -- no runtime call site. | | `worker_restart_total` | `taskType` | TaskHandler subprocess restarts. | | `external_payload_used_total` | `entityName`, `operation`, `payload_type` | External payload storage usage. | | `workflow_start_error_total` | `workflowType`, `exception` | Workflow start errors. `exception` is `str(exception)`. | @@ -183,9 +191,9 @@ Legacy mode does not emit `task_poll_error_total`, | `task_execute_time_seconds` | Quantile gauge | `taskType`, `status`, `quantile` | Sliding-window execution latency quantiles. | | `task_execute_time_seconds_count` | Gauge | `taskType`, `status` | Sliding-window execution observation count. | | `task_execute_time_seconds_sum` | Gauge | `taskType`, `status` | Sliding-window execution duration sum. | -| `task_update_time_seconds` | Quantile gauge | `taskType`, `status`, `quantile` | Sliding-window task update latency quantiles. | -| `task_update_time_seconds_count` | Gauge | `taskType`, `status` | Sliding-window task update observation count. | -| `task_update_time_seconds_sum` | Gauge | `taskType`, `status` | Sliding-window task update duration sum. | +| `task_update_time_seconds` | Quantile gauge | `taskType`, `status`, `quantile` | Sliding-window task update latency quantiles. Never emitted -- no call sites existed on main. Legacy no-ops this metric; only canonical mode records task update time. | +| `task_update_time_seconds_count` | Gauge | `taskType`, `status` | Sliding-window task update observation count. Never emitted (see above). | +| `task_update_time_seconds_sum` | Gauge | `taskType`, `status` | Sliding-window task update duration sum. Never emitted (see above). | | `http_api_client_request` | Quantile gauge | `method`, `uri`, `status`, `quantile` | Sliding-window API-client request latency quantiles. | | `http_api_client_request_count` | Gauge | `method`, `uri`, `status` | Sliding-window API-client request observation count. | | `http_api_client_request_sum` | Gauge | `method`, `uri`, `status` | Sliding-window API-client request duration sum. | @@ -203,7 +211,7 @@ Legacy mode does not emit `task_poll_error_total`, |---|---|---| | `taskType` | Worker metrics | Task definition name. | | `workflowType` | Workflow metrics | Workflow definition name. | -| `version` | `workflow_input_size`, `workflow_input_size_bytes` | Workflow version as a string. If absent, the SDK uses `1`. | +| `version` | `workflow_input_size`, `workflow_input_size_bytes` | Workflow version as a string. If absent, the label is an empty string. | | `status` | Task time metrics | `SUCCESS` or `FAILURE`. For HTTP metrics, the response code as a string, an exception `status` or `code`, or `error` when unavailable. | | `exception` | Error counters | Legacy uses `str(exception)`. Canonical uses the exception class name, such as `TimeoutError`. | | `entityName` | `external_payload_used_total` | Task type or workflow name associated with the external payload. | @@ -251,7 +259,7 @@ Common PromQL replacements: |---|---| | `task_poll_time_seconds{quantile="0.95"}` | `histogram_quantile(0.95, sum by (le, taskType, status) (rate(task_poll_time_seconds_bucket[5m])))` | | `task_execute_time_seconds{quantile="0.95"}` | `histogram_quantile(0.95, sum by (le, taskType, status) (rate(task_execute_time_seconds_bucket[5m])))` | -| `task_update_time_seconds{quantile="0.95"}` | `histogram_quantile(0.95, sum by (le, taskType, status) (rate(task_update_time_seconds_bucket[5m])))` | +| `task_update_time_seconds{quantile="0.95"}` (never emitted in legacy) | `histogram_quantile(0.95, sum by (le, taskType, status) (rate(task_update_time_seconds_bucket[5m])))` | | `http_api_client_request{quantile="0.95"}` | `histogram_quantile(0.95, sum by (le, method, uri, status) (rate(http_api_client_request_seconds_bucket[5m])))` | | `task_result_size` | `task_result_size_bytes_bucket`, `_count`, and `_sum` | | `workflow_input_size` | `workflow_input_size_bytes_bucket`, `_count`, and `_sum` | @@ -292,3 +300,74 @@ sum(rate(task_execute_time_seconds_count[5m])) by (taskType) available at the generated API-client call site. - Avoid embedding user identifiers or unbounded values in task type, workflow type, or external payload labels. + +## Detailed Technical Notes -- Unreleased + +Implementation details, internal design decisions, and migration notes for the +unreleased metrics harmonization work. For a summary, see the project +[CHANGELOG](CHANGELOG.md). + +### Added + +- **Metrics harmonization** -- canonical metric surface aligned with the + cross-SDK catalog, opt-in via `WORKER_CANONICAL_METRICS=true` + - New `CanonicalMetricsCollector` emits the harmonized cross-SDK catalog using + real Prometheus `Histogram`s for timing and size, replacing the legacy + quantile-gauge timing shape. New canonical-only metrics: + `task_poll_error_total`, `task_execution_started_total`, + `task_result_size_bytes`, `workflow_input_size_bytes`, + `http_api_client_request_seconds`, `active_workers`. Time buckets + `0.001…10s`; size buckets `100…10_000_000` bytes. + - `metrics_factory.create_metrics_collector(settings)` selects + `LegacyMetricsCollector` (default) or `CanonicalMetricsCollector` based on + `WORKER_CANONICAL_METRICS` (truthy: `true`, `1`, `yes`, case-insensitive, + whitespace-trimmed). `WORKER_LEGACY_METRICS` is reserved for future use and + is not currently read. + - New abstract `MetricsCollectorBase` consolidates Prometheus infrastructure + (lazy `prometheus_client` imports, multiprocess `NoPidCollector` aggregation, + HTTP server, exception-label cardinality bounding) and event handlers shared + by both collectors. + - `TaskRunner` and `AsyncTaskRunner` now record `task_update_time` + (`status="SUCCESS"` / `"FAILURE"`) on every update path. + - `OrkesWorkflowClient.start_workflow*` calls + `measure_workflow_input_payload_size` and `measure_workflow_start_error` on + the collector; canonical records workflow input size and start errors, legacy + no-ops (preserving existing behavior). `OrkesClients` / `OrkesBaseClient` + accept an optional `metrics_collector`. + - Legacy metrics use the base directory unchanged from prior releases; + canonical metrics use a `canonical/` subdirectory so that switching + implementations never produces stale metric names. + `MetricsSettings.metrics_directory` is a computed property that derives the + correct path from `WORKER_CANONICAL_METRICS`, ensuring consistent paths + across all processes (main, workers, MetricsProvider). + - `MetricsSettings` gains `clean_directory` (default `False`) to wipe all + `.db` files and `clean_dead_pids` (default `False`) to remove only `.db` + files from PIDs that no longer exist. Both are executed by the factory + against the resolved metrics directory. + - `CONDUCTOR_MP_START_METHOD` env var (`spawn` / `fork` / `forkserver`; + default `fork` on POSIX, `spawn` on Windows) to control the worker pool's + multiprocessing start method (motivated by a `prometheus_client` lock-fork + deadlock). + - Harness manifest sets `WORKER_CANONICAL_METRICS=true`; `harness/main.py` + logs which collector is active. + +### Changed + +- **Metrics harmonization** -- defaults preserved; legacy metrics emit unchanged + when `WORKER_CANONICAL_METRICS` is unset + - `MetricLabel.PAYLOAD_TYPE` retains its original value `"payload_type"`; a + new `PAYLOAD_TYPE_CAMEL = "payloadType"` constant is used only by the + canonical collector on `external_payload_used_total`. + - `metrics_collector.py` is now a thin compatibility shim: + `MetricsCollector = LegacyMetricsCollector`, so + `from conductor.client.telemetry.metrics_collector import MetricsCollector` + continues to work. + - Default behavior is unchanged: with no env var set, the legacy metric names, + label conventions, and quantile-gauge timing shape from prior releases are + preserved. + - Rewrote `METRICS.md` to document both surfaces, the env-var gate, full + canonical and legacy catalogs, labels, a "Migrating From Legacy to + Canonical" mapping (including the `payload_type` → `payloadType` label + change and PromQL replacements), and troubleshooting. + - Updated `README.md`, `WORKER_CONFIGURATION.md`, and + `docs/design/WORKER_DESIGN.md` to point at `METRICS.md`. diff --git a/src/conductor/client/telemetry/metrics_collector_base.py b/src/conductor/client/telemetry/metrics_collector_base.py index 374a6123b..4be672c74 100644 --- a/src/conductor/client/telemetry/metrics_collector_base.py +++ b/src/conductor/client/telemetry/metrics_collector_base.py @@ -486,7 +486,7 @@ def on_workflow_started(self, event: WorkflowStarted) -> None: self.increment_workflow_start_error(event.name, event.cause) def on_workflow_input_payload_size(self, event: WorkflowInputPayloadSize) -> None: - version_str = str(event.version) if event.version is not None else "1" + version_str = str(event.version) if event.version is not None else "" self.record_workflow_input_payload_size(event.name, version_str, event.size_bytes) def on_workflow_payload_used(self, event: WorkflowPayloadUsed) -> None: diff --git a/src/conductor/client/telemetry/metrics_factory.py b/src/conductor/client/telemetry/metrics_factory.py index ebe7a58c5..628d12e03 100644 --- a/src/conductor/client/telemetry/metrics_factory.py +++ b/src/conductor/client/telemetry/metrics_factory.py @@ -3,10 +3,11 @@ environment variables. WORKER_CANONICAL_METRICS=true -> CanonicalMetricsCollector - WORKER_LEGACY_METRICS=true -> LegacyMetricsCollector (default during deprecation) + (unset / any other value) -> LegacyMetricsCollector (default during deprecation) -If WORKER_CANONICAL_METRICS is true it takes priority regardless of the value -of WORKER_LEGACY_METRICS. +WORKER_LEGACY_METRICS is reserved for future use. After the deprecation +period ends and canonical becomes the default, setting WORKER_LEGACY_METRICS=true +will allow opting back into legacy metrics. It is not currently read. """ import logging From 443e31ddb72ed9e4ac9fc0ea33ac26381fcbb41d Mon Sep 17 00:00:00 2001 From: Chris Hagglund Date: Tue, 12 May 2026 13:22:12 -0600 Subject: [PATCH 18/24] restore default version label in one legacy metric --- .../client/telemetry/legacy_metrics_collector.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/src/conductor/client/telemetry/legacy_metrics_collector.py b/src/conductor/client/telemetry/legacy_metrics_collector.py index 1ce724058..d70a86a58 100644 --- a/src/conductor/client/telemetry/legacy_metrics_collector.py +++ b/src/conductor/client/telemetry/legacy_metrics_collector.py @@ -12,6 +12,7 @@ from conductor.client.configuration.settings.metrics_settings import MetricsSettings from conductor.client.telemetry import metrics_collector_base as _mcb from conductor.client.telemetry.metrics_collector_base import MetricsCollectorBase +from conductor.client.event.workflow_events import WorkflowInputPayloadSize from conductor.client.telemetry.model.metric_documentation import MetricDocumentation from conductor.client.telemetry.model.metric_label import MetricLabel from conductor.client.telemetry.model.metric_name import MetricName @@ -187,6 +188,18 @@ def record_api_request_time(self, method: str, uri: str, status: str, time_spent value=time_spent, ) + # ------------------------------------------------------------------ + # Event handler overrides + # ------------------------------------------------------------------ + + def on_workflow_input_payload_size(self, event: WorkflowInputPayloadSize) -> None: + # The original metrics_collector.py defaulted version to "1" when + # absent. The canonical standard across all SDKs is "" (empty + # string). We preserve "1" here so that legacy upgraders don't see + # a time-series label change. + version_str = str(event.version) if event.version is not None else "1" + self.record_workflow_input_payload_size(event.name, version_str, event.size_bytes) + # ------------------------------------------------------------------ # Sizes (last-value gauges) # ------------------------------------------------------------------ From 2038fdc0df7d7d0ea3768bbd8d81a9e31c592ade Mon Sep 17 00:00:00 2001 From: Chris Hagglund Date: Tue, 19 May 2026 12:41:38 -0600 Subject: [PATCH 19/24] remove cruft --- src/conductor/client/telemetry/metrics_collector_base.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/conductor/client/telemetry/metrics_collector_base.py b/src/conductor/client/telemetry/metrics_collector_base.py index 4be672c74..50f91a8d8 100644 --- a/src/conductor/client/telemetry/metrics_collector_base.py +++ b/src/conductor/client/telemetry/metrics_collector_base.py @@ -5,7 +5,7 @@ import signal import threading import time -from typing import Any, Dict, List +from typing import Any, Dict # Lazy imports - these will be imported when first needed. # PROMETHEUS_MULTIPROC_DIR must be set before prometheus_client is imported. @@ -103,7 +103,6 @@ def __init__(self, settings: MetricsSettings): self.counters: Dict[str, Any] = {} self.gauges: Dict[str, Any] = {} self.histograms: Dict[str, Any] = {} - self.summaries: Dict[str, Any] = {} self.registry = None self.must_collect_metrics = False self._lock = threading.RLock() From 620b89508ae2c2ce1436340ca49fbc2577e41d49 Mon Sep 17 00:00:00 2001 From: Chris Hagglund Date: Mon, 1 Jun 2026 10:04:41 -0600 Subject: [PATCH 20/24] reset cleaned-directories-set in test teardown, check for specific process-not-existing exception in cleanup for pid-named db files --- .../settings/metrics_settings.py | 15 ++++++- .../client/telemetry/metrics_factory.py | 7 ++++ tests/unit/telemetry/test_metrics_factory.py | 41 ++++++++++++++++++- 3 files changed, 60 insertions(+), 3 deletions(-) diff --git a/src/conductor/client/configuration/settings/metrics_settings.py b/src/conductor/client/configuration/settings/metrics_settings.py index dab240f1d..b2fd37954 100644 --- a/src/conductor/client/configuration/settings/metrics_settings.py +++ b/src/conductor/client/configuration/settings/metrics_settings.py @@ -109,9 +109,20 @@ def _clean_dead_pid_files(self) -> None: pid = int(match.group(1)) try: os.kill(pid, 0) - except OSError: + except ProcessLookupError: + # ESRCH: no process owns this PID -> safe to remove its db file. try: os.remove(path) logger.debug("Removed dead-pid metrics file %s (pid %d)", path, pid) - except Exception as e: + except OSError as e: logger.debug("Could not remove dead-pid metrics db file %s: %s", path, e) + except OSError as e: + # Any non-ESRCH probe failure (commonly EPERM: the process is + # alive but owned by another user) -> keep the file; deleting it + # could corrupt a live worker's metrics in a shared directory. + logger.debug( + "Keeping metrics db file %s; pid %d probe returned a " + "non-ProcessLookupError OSError (process likely alive): %s", + path, pid, e, + ) + continue diff --git a/src/conductor/client/telemetry/metrics_factory.py b/src/conductor/client/telemetry/metrics_factory.py index 628d12e03..8a2aed960 100644 --- a/src/conductor/client/telemetry/metrics_factory.py +++ b/src/conductor/client/telemetry/metrics_factory.py @@ -35,6 +35,13 @@ def _env_bool(name: str, default: bool) -> bool: _cleaned_directories: set = set() +def _reset_cleaned_directories() -> None: + """Clear the per-process cleanup guard. Intended for tests so that each + test starts from a known-clean state and does not inherit or leak entries + via this module-level global.""" + _cleaned_directories.clear() + + def resolve_metrics_type(settings: MetricsSettings) -> None: """Read ``WORKER_CANONICAL_METRICS`` once and store the result on *settings*. diff --git a/tests/unit/telemetry/test_metrics_factory.py b/tests/unit/telemetry/test_metrics_factory.py index ebe17864c..e366203b8 100644 --- a/tests/unit/telemetry/test_metrics_factory.py +++ b/tests/unit/telemetry/test_metrics_factory.py @@ -6,9 +6,13 @@ import shutil import tempfile import unittest +from unittest import mock from conductor.client.configuration.settings.metrics_settings import MetricsSettings -from conductor.client.telemetry.metrics_factory import create_metrics_collector +from conductor.client.telemetry.metrics_factory import ( + create_metrics_collector, + _reset_cleaned_directories, +) from conductor.client.telemetry.legacy_metrics_collector import LegacyMetricsCollector from conductor.client.telemetry.canonical_metrics_collector import CanonicalMetricsCollector @@ -16,6 +20,7 @@ class TestMetricsFactory(unittest.TestCase): def setUp(self): + _reset_cleaned_directories() self.metrics_dir = tempfile.mkdtemp() self.settings = MetricsSettings(directory=self.metrics_dir) self._saved_env = {} @@ -30,6 +35,7 @@ def tearDown(self): os.environ[key] = val if os.path.exists(self.metrics_dir): shutil.rmtree(self.metrics_dir) + _reset_cleaned_directories() def test_default_returns_legacy(self): """With no env vars set, factory returns LegacyMetricsCollector.""" @@ -126,6 +132,39 @@ def test_both_implementations_satisfy_same_interface(self): f"CanonicalMetricsCollector missing method: {method_name}", ) + def test_cleanup_runs_once_per_directory(self): + """clean_directory cleanup fires only on the first collector for a given + directory; a second collector on the same directory must skip it so that + live .db files written by existing workers are never wiped.""" + settings = MetricsSettings(directory=self.metrics_dir, clean_directory=True) + + with mock.patch.object( + MetricsSettings, "_clean_stale_db_files", autospec=True + ) as clean_spy: + create_metrics_collector(settings) + create_metrics_collector(settings) + + self.assertEqual( + clean_spy.call_count, + 1, + "stale-db cleanup must run exactly once per directory per process", + ) + + def test_cleanup_runs_again_after_reset(self): + """Resetting the per-process guard allows cleanup to run again, proving + the global state is what gates the skip (and that setUp/tearDown isolate + tests from each other).""" + settings = MetricsSettings(directory=self.metrics_dir, clean_directory=True) + + with mock.patch.object( + MetricsSettings, "_clean_stale_db_files", autospec=True + ) as clean_spy: + create_metrics_collector(settings) + _reset_cleaned_directories() + create_metrics_collector(settings) + + self.assertEqual(clean_spy.call_count, 2) + if __name__ == "__main__": unittest.main() From 8eaaed2e21bb817386e52dcd1332cc40666f2e2c Mon Sep 17 00:00:00 2001 From: Chris Hagglund Date: Tue, 2 Jun 2026 08:10:32 -0600 Subject: [PATCH 21/24] avoid double serializing json by capturing request size in rest http client and adjusting surrounding calls to be able to pass the info back up. additional improvements based on pr feedback + tests --- src/conductor/client/http/api_client.py | 12 +++- src/conductor/client/http/async_api_client.py | 9 ++- src/conductor/client/http/rest.py | 56 +++++++++------- .../client/orkes/orkes_base_client.py | 8 ++- .../client/orkes/orkes_workflow_client.py | 43 +++++++----- src/conductor/client/orkes_clients.py | 9 ++- .../telemetry/canonical_metrics_collector.py | 3 +- .../workflow/executor/workflow_executor.py | 7 +- .../api_client/test_api_client_coverage.py | 34 ++++++++++ tests/unit/orkes/test_workflow_client.py | 20 +++--- tests/unit/telemetry/test_metrics_factory.py | 66 +++++++++++++++++++ 11 files changed, 208 insertions(+), 59 deletions(-) diff --git a/src/conductor/client/http/api_client.py b/src/conductor/client/http/api_client.py index 54b208dab..6ad613cbd 100644 --- a/src/conductor/client/http/api_client.py +++ b/src/conductor/client/http/api_client.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import base64 import datetime import logging @@ -6,7 +8,7 @@ import re import tempfile import time -from typing import Dict +from typing import Dict, Optional, TYPE_CHECKING import uuid import six @@ -20,6 +22,9 @@ from conductor.client.http.rest import AuthorizationException from conductor.client.http.thread import AwaitableThread +if TYPE_CHECKING: + from conductor.client.telemetry.metrics_collector_base import MetricsCollectorBase + logger = logging.getLogger( Configuration.get_logging_formatted_name( __name__ @@ -45,7 +50,7 @@ def __init__( header_name=None, header_value=None, cookie=None, - metrics_collector=None + metrics_collector: Optional[MetricsCollectorBase] = None ): if configuration is None: configuration = Configuration() @@ -188,7 +193,8 @@ def __call_api_no_retry( return (return_data) else: return (return_data, response_data.status, - response_data.getheaders()) + response_data.getheaders(), + getattr(response_data, 'request_body_size', 0)) def sanitize_for_serialization(self, obj): """Builds a JSON POST object. diff --git a/src/conductor/client/http/async_api_client.py b/src/conductor/client/http/async_api_client.py index 6b094d4bd..c290cc0eb 100644 --- a/src/conductor/client/http/async_api_client.py +++ b/src/conductor/client/http/async_api_client.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import base64 import datetime import logging @@ -6,7 +8,7 @@ import re import tempfile import time -from typing import Dict +from typing import Dict, Optional, TYPE_CHECKING import uuid import six @@ -19,6 +21,9 @@ from conductor.client.http import async_rest from conductor.client.http.async_rest import AuthorizationException +if TYPE_CHECKING: + from conductor.client.telemetry.metrics_collector_base import MetricsCollectorBase + logger = logging.getLogger( Configuration.get_logging_formatted_name( __name__ @@ -47,7 +52,7 @@ def __init__( header_name=None, header_value=None, cookie=None, - metrics_collector=None + metrics_collector: Optional[MetricsCollectorBase] = None ): if configuration is None: configuration = Configuration() diff --git a/src/conductor/client/http/rest.py b/src/conductor/client/http/rest.py index c82708f82..2acaf6978 100644 --- a/src/conductor/client/http/rest.py +++ b/src/conductor/client/http/rest.py @@ -29,6 +29,7 @@ def __init__(self, resp): self.reason = getattr(resp, 'reason_phrase', '') or self._get_reason_phrase(resp.status_code) self.data = resp.text # eagerly read body self.headers = resp.headers + self.request_body_size = 0 # Break httpx Response <-> BoundSyncStream reference cycle (issue #395) resp.stream = None try: @@ -219,36 +220,42 @@ def request(self, method, url, query_params=None, headers=None, "httpx client was closed before request; re-established a fresh client" ) + # Serialize the request body once, before the retry loop. + # The body never changes between attempts; only the httpx client does. + request_body = None + request_body_size = 0 + request_url = url + has_body = method in ['POST', 'PUT', 'PATCH', 'OPTIONS', 'DELETE'] + + if has_body: + if query_params: + request_url = url + '?' + urlencode(query_params) + if re.search('json', headers['Content-Type'], re.IGNORECASE) or isinstance(body, str): + request_body = '{}' + if body is not None: + request_body = json.dumps(body) + if isinstance(body, str): + request_body = request_body.strip('"') + request_body_size = len(request_body.encode('utf-8')) + else: + msg = """Cannot prepare a request message for provided + arguments. Please check that your arguments match + declared content type.""" + raise ApiException(status=0, reason=msg) + for attempt in range(2): # Snapshot the client we're about to use so that if the request # fails we can ask for a reset only if the client is still this # one (i.e. nobody else healed it in the meantime). client_at_send = self.connection try: - # For `POST`, `PUT`, `PATCH`, `OPTIONS`, `DELETE` - if method in ['POST', 'PUT', 'PATCH', 'OPTIONS', 'DELETE']: - request_url = url - if query_params: - request_url += '?' + urlencode(query_params) - if re.search('json', headers['Content-Type'], re.IGNORECASE) or isinstance(body, str): - request_body = '{}' - if body is not None: - request_body = json.dumps(body) - if isinstance(body, str): - request_body = request_body.strip('"') - r = client_at_send.request( - method, request_url, - content=request_body, - timeout=timeout, - headers=headers - ) - else: - # Cannot generate the request from given parameters - msg = """Cannot prepare a request message for provided - arguments. Please check that your arguments match - declared content type.""" - raise ApiException(status=0, reason=msg) - # For `GET`, `HEAD` + if has_body: + r = client_at_send.request( + method, request_url, + content=request_body, + timeout=timeout, + headers=headers + ) else: r = client_at_send.request( method, url, @@ -309,6 +316,7 @@ def request(self, method, url, query_params=None, headers=None, if _preload_content: r = RESTResponse(r) + r.request_body_size = request_body_size if r.status == 401 or r.status == 403: raise AuthorizationException(http_resp=r) diff --git a/src/conductor/client/orkes/orkes_base_client.py b/src/conductor/client/orkes/orkes_base_client.py index 3c5314ad8..229cb9dba 100644 --- a/src/conductor/client/orkes/orkes_base_client.py +++ b/src/conductor/client/orkes/orkes_base_client.py @@ -1,6 +1,12 @@ +from __future__ import annotations + import logging +from typing import Optional, TYPE_CHECKING from conductor.client.configuration.configuration import Configuration + +if TYPE_CHECKING: + from conductor.client.telemetry.metrics_collector_base import MetricsCollectorBase from conductor.client.http.api.application_resource_api import ApplicationResourceApi from conductor.client.http.api.authorization_resource_api import AuthorizationResourceApi from conductor.client.http.api.gateway_auth_resource_api import GatewayAuthResourceApi @@ -22,7 +28,7 @@ class OrkesBaseClient(object): - def __init__(self, configuration: Configuration, metrics_collector=None): + def __init__(self, configuration: Configuration, metrics_collector: Optional[MetricsCollectorBase] = None): self.metrics_collector = metrics_collector self.api_client = ApiClient(configuration, metrics_collector=metrics_collector) self.logger = logging.getLogger( diff --git a/src/conductor/client/orkes/orkes_workflow_client.py b/src/conductor/client/orkes/orkes_workflow_client.py index b75453cf1..d70377ab3 100644 --- a/src/conductor/client/orkes/orkes_workflow_client.py +++ b/src/conductor/client/orkes/orkes_workflow_client.py @@ -1,5 +1,5 @@ from __future__ import annotations -from typing import Optional, List, Dict +from typing import Optional, List, Dict, TYPE_CHECKING from conductor.client.configuration.configuration import Configuration from conductor.client.http.models import SkipTaskRequest, WorkflowStatus, \ @@ -14,12 +14,15 @@ from conductor.client.orkes.orkes_base_client import OrkesBaseClient from conductor.client.workflow_client import WorkflowClient +if TYPE_CHECKING: + from conductor.client.telemetry.metrics_collector_base import MetricsCollectorBase + class OrkesWorkflowClient(OrkesBaseClient, WorkflowClient): def __init__( self, configuration: Configuration, - metrics_collector=None, + metrics_collector: Optional[MetricsCollectorBase] = None, ): super(OrkesWorkflowClient, self).__init__(configuration, metrics_collector=metrics_collector) @@ -39,13 +42,10 @@ def start_workflow_by_name( if priority: kwargs.update({"priority": priority}) - if self.metrics_collector is not None: - try: - self.metrics_collector.measure_workflow_input_payload_size(name, version, input) - except Exception: - pass try: - return self.workflowResourceApi.start_workflow1(input, name, **kwargs) + data, _status, _headers, request_body_size = ( + self.workflowResourceApi.start_workflow1_with_http_info(input, name, **kwargs) + ) except Exception as e: if self.metrics_collector is not None: try: @@ -54,18 +54,20 @@ def start_workflow_by_name( pass raise - def start_workflow(self, start_workflow_request: StartWorkflowRequest) -> str: if self.metrics_collector is not None: try: - self.metrics_collector.measure_workflow_input_payload_size( - start_workflow_request.name, - start_workflow_request.version, - getattr(start_workflow_request, "input", None), - ) + version_str = str(version) if version is not None else "" + self.metrics_collector.record_workflow_input_payload_size(name, version_str, request_body_size) except Exception: pass + + return data + + def start_workflow(self, start_workflow_request: StartWorkflowRequest) -> str: try: - return self.workflowResourceApi.start_workflow(start_workflow_request) + data, _status, _headers, request_body_size = ( + self.workflowResourceApi.start_workflow_with_http_info(start_workflow_request) + ) except Exception as e: if self.metrics_collector is not None: try: @@ -74,6 +76,17 @@ def start_workflow(self, start_workflow_request: StartWorkflowRequest) -> str: pass raise + if self.metrics_collector is not None: + try: + version_str = str(start_workflow_request.version) if start_workflow_request.version is not None else "" + self.metrics_collector.record_workflow_input_payload_size( + start_workflow_request.name, version_str, request_body_size, + ) + except Exception: + pass + + return data + def execute_workflow( self, start_workflow_request: StartWorkflowRequest, diff --git a/src/conductor/client/orkes_clients.py b/src/conductor/client/orkes_clients.py index 50c773907..92029fd8f 100644 --- a/src/conductor/client/orkes_clients.py +++ b/src/conductor/client/orkes_clients.py @@ -1,5 +1,12 @@ +from __future__ import annotations + +from typing import Optional, TYPE_CHECKING + from conductor.client.authorization_client import AuthorizationClient from conductor.client.configuration.configuration import Configuration + +if TYPE_CHECKING: + from conductor.client.telemetry.metrics_collector_base import MetricsCollectorBase from conductor.client.integration_client import IntegrationClient from conductor.client.metadata_client import MetadataClient from conductor.client.orkes.orkes_integration_client import OrkesIntegrationClient @@ -21,7 +28,7 @@ class OrkesClients: - def __init__(self, configuration: Configuration = None, metrics_collector=None): + def __init__(self, configuration: Configuration = None, metrics_collector: Optional[MetricsCollectorBase] = None): if configuration is None: configuration = Configuration() self.configuration = configuration diff --git a/src/conductor/client/telemetry/canonical_metrics_collector.py b/src/conductor/client/telemetry/canonical_metrics_collector.py index d5f18a147..fc6f2e8a4 100644 --- a/src/conductor/client/telemetry/canonical_metrics_collector.py +++ b/src/conductor/client/telemetry/canonical_metrics_collector.py @@ -194,12 +194,13 @@ def record_task_update_time(self, task_type: str, time_spent: float, status: str def record_api_request_time(self, method: str, uri: str, status: str, time_spent: float, metric_uri: str = None) -> None: + resolved_uri = (metric_uri or uri).replace("{tasktype}", "{taskType}") self._observe_histogram( name=MetricName.API_REQUEST_TIME_CANONICAL, documentation=MetricDocumentation.API_REQUEST_TIME_CANONICAL, labels={ MetricLabel.METHOD: method, - MetricLabel.URI: metric_uri or uri, + MetricLabel.URI: resolved_uri, MetricLabel.STATUS: status, }, value=time_spent, diff --git a/src/conductor/client/workflow/executor/workflow_executor.py b/src/conductor/client/workflow/executor/workflow_executor.py index c8ff1f939..d7a386f71 100644 --- a/src/conductor/client/workflow/executor/workflow_executor.py +++ b/src/conductor/client/workflow/executor/workflow_executor.py @@ -1,6 +1,6 @@ from __future__ import annotations import uuid -from typing import Any, Dict, List, Optional +from typing import Any, Dict, List, Optional, TYPE_CHECKING from typing_extensions import Self @@ -25,9 +25,12 @@ ) from conductor.client.orkes.orkes_workflow_client import OrkesWorkflowClient +if TYPE_CHECKING: + from conductor.client.telemetry.metrics_collector_base import MetricsCollectorBase + class WorkflowExecutor: - def __init__(self, configuration: Configuration, metrics_collector=None) -> Self: + def __init__(self, configuration: Configuration, metrics_collector: Optional[MetricsCollectorBase] = None) -> Self: api_client = ApiClient(configuration, metrics_collector=metrics_collector) self.metadata_client = MetadataResourceApi(api_client) self.task_client = TaskResourceApi(api_client) diff --git a/tests/unit/api_client/test_api_client_coverage.py b/tests/unit/api_client/test_api_client_coverage.py index 7ce0ffdca..6b8a3d690 100644 --- a/tests/unit/api_client/test_api_client_coverage.py +++ b/tests/unit/api_client/test_api_client_coverage.py @@ -683,6 +683,40 @@ def test_request_with_metrics_collector(self): self.assertEqual(call_args[1]['method'], 'GET') self.assertEqual(call_args[1]['status'], '200') + def test_request_passes_metric_uri_to_collector(self): + """metric_uri kwarg is forwarded to record_api_request_time on success.""" + metrics_collector = Mock() + with patch.object(ApiClient, '_ApiClient__refresh_auth_token'): + client = ApiClient(configuration=self.config, metrics_collector=metrics_collector) + + with patch.object(client.rest_client, 'GET', return_value=Mock(status=200)): + client.request( + 'GET', 'http://localhost:8080/api/workflow/test-id', + metric_uri='/api/workflow/{workflowId}', + ) + + call_args = metrics_collector.record_api_request_time.call_args + self.assertEqual(call_args[1]['metric_uri'], '/api/workflow/{workflowId}') + + def test_request_passes_metric_uri_to_collector_on_error(self): + """metric_uri kwarg is forwarded to record_api_request_time on error.""" + metrics_collector = Mock() + with patch.object(ApiClient, '_ApiClient__refresh_auth_token'): + client = ApiClient(configuration=self.config, metrics_collector=metrics_collector) + + error = Exception('Test error') + error.status = 500 + + with patch.object(client.rest_client, 'GET', side_effect=error): + with self.assertRaises(Exception): + client.request( + 'GET', 'http://localhost:8080/api/workflow/test-id', + metric_uri='/api/workflow/{workflowId}', + ) + + call_args = metrics_collector.record_api_request_time.call_args + self.assertEqual(call_args[1]['metric_uri'], '/api/workflow/{workflowId}') + def test_request_with_metrics_collector_on_error(self): """Test request method with metrics collector on error""" metrics_collector = Mock() diff --git a/tests/unit/orkes/test_workflow_client.py b/tests/unit/orkes/test_workflow_client.py index 294db6b02..75fd8701e 100644 --- a/tests/unit/orkes/test_workflow_client.py +++ b/tests/unit/orkes/test_workflow_client.py @@ -39,37 +39,37 @@ def test_init(self): message = "workflowResourceApi is not of type WorkflowResourceApi" self.assertIsInstance(self.workflow_client.workflowResourceApi, WorkflowResourceApi, message) - @patch.object(WorkflowResourceApi, 'start_workflow1') + @patch.object(WorkflowResourceApi, 'start_workflow1_with_http_info') def test_startWorkflowByName(self, mock): - mock.return_value = WORKFLOW_UUID + mock.return_value = (WORKFLOW_UUID, 200, {}, 42) wfId = self.workflow_client.start_workflow_by_name(WORKFLOW_NAME, self.input) mock.assert_called_with(self.input, WORKFLOW_NAME) self.assertEqual(wfId, WORKFLOW_UUID) - @patch.object(WorkflowResourceApi, 'start_workflow1') + @patch.object(WorkflowResourceApi, 'start_workflow1_with_http_info') def test_startWorkflowByName_with_version(self, mock): - mock.return_value = WORKFLOW_UUID + mock.return_value = (WORKFLOW_UUID, 200, {}, 42) wfId = self.workflow_client.start_workflow_by_name(WORKFLOW_NAME, self.input, version=1) mock.assert_called_with(self.input, WORKFLOW_NAME, version=1) self.assertEqual(wfId, WORKFLOW_UUID) - @patch.object(WorkflowResourceApi, 'start_workflow1') + @patch.object(WorkflowResourceApi, 'start_workflow1_with_http_info') def test_startWorkflowByName_with_correlation_id(self, mock): - mock.return_value = WORKFLOW_UUID + mock.return_value = (WORKFLOW_UUID, 200, {}, 42) wfId = self.workflow_client.start_workflow_by_name(WORKFLOW_NAME, self.input, correlationId=CORRELATION_ID) mock.assert_called_with(self.input, WORKFLOW_NAME, correlation_id=CORRELATION_ID) self.assertEqual(wfId, WORKFLOW_UUID) - @patch.object(WorkflowResourceApi, 'start_workflow1') + @patch.object(WorkflowResourceApi, 'start_workflow1_with_http_info') def test_startWorkflowByName_with_version_and_priority(self, mock): - mock.return_value = WORKFLOW_UUID + mock.return_value = (WORKFLOW_UUID, 200, {}, 42) wfId = self.workflow_client.start_workflow_by_name(WORKFLOW_NAME, self.input, version=1, priority=1) mock.assert_called_with(self.input, WORKFLOW_NAME, version=1, priority=1) self.assertEqual(wfId, WORKFLOW_UUID) - @patch.object(WorkflowResourceApi, 'start_workflow') + @patch.object(WorkflowResourceApi, 'start_workflow_with_http_info') def test_startWorkflow(self, mock): - mock.return_value = WORKFLOW_UUID + mock.return_value = (WORKFLOW_UUID, 200, {}, 42) startWorkflowReq = StartWorkflowRequest() wfId = self.workflow_client.start_workflow(startWorkflowReq) mock.assert_called_with(startWorkflowReq) diff --git a/tests/unit/telemetry/test_metrics_factory.py b/tests/unit/telemetry/test_metrics_factory.py index e366203b8..7fc77b5cf 100644 --- a/tests/unit/telemetry/test_metrics_factory.py +++ b/tests/unit/telemetry/test_metrics_factory.py @@ -54,6 +54,12 @@ def test_canonical_1_returns_canonical(self): collector = create_metrics_collector(self.settings) self.assertIsInstance(collector, CanonicalMetricsCollector) + def test_canonical_yes_returns_canonical(self): + """WORKER_CANONICAL_METRICS=yes selects CanonicalMetricsCollector.""" + os.environ["WORKER_CANONICAL_METRICS"] = "yes" + collector = create_metrics_collector(self.settings) + self.assertIsInstance(collector, CanonicalMetricsCollector) + def test_canonical_false_returns_legacy(self): """WORKER_CANONICAL_METRICS=false selects LegacyMetricsCollector.""" os.environ["WORKER_CANONICAL_METRICS"] = "false" @@ -165,6 +171,66 @@ def test_cleanup_runs_again_after_reset(self): self.assertEqual(clean_spy.call_count, 2) + def test_clean_directory_removes_db_files(self): + """clean_directory=True actually removes .db files from the metrics dir.""" + db_path = os.path.join(self.metrics_dir, "gauge_livesum_12345.db") + with open(db_path, "w") as f: + f.write("fake") + + settings = MetricsSettings(directory=self.metrics_dir, clean_directory=True) + create_metrics_collector(settings) + + self.assertFalse( + os.path.exists(db_path), + "clean_directory=True should remove existing .db files", + ) + + def test_clean_dead_pids_removes_dead_pid_file(self): + """clean_dead_pids=True removes .db files whose PID no longer exists.""" + dead_pid = 2_000_000_000 + db_path = os.path.join(self.metrics_dir, f"gauge_livesum_{dead_pid}.db") + with open(db_path, "w") as f: + f.write("fake") + + settings = MetricsSettings(directory=self.metrics_dir, clean_dead_pids=True) + create_metrics_collector(settings) + + self.assertFalse( + os.path.exists(db_path), + "clean_dead_pids=True should remove .db file for a non-existent PID", + ) + + def test_clean_dead_pids_keeps_live_pid_file(self): + """clean_dead_pids=True keeps .db files whose PID is still alive.""" + live_pid = os.getpid() + db_path = os.path.join(self.metrics_dir, f"gauge_livesum_{live_pid}.db") + with open(db_path, "w") as f: + f.write("fake") + + settings = MetricsSettings(directory=self.metrics_dir, clean_dead_pids=True) + create_metrics_collector(settings) + + self.assertTrue( + os.path.exists(db_path), + "clean_dead_pids=True must keep .db files for live PIDs", + ) + + @mock.patch("os.kill", side_effect=PermissionError("Operation not permitted")) + def test_clean_dead_pids_keeps_file_on_permission_error(self, _mock_kill): + """PermissionError from os.kill means the process is alive but owned + by another user -- the .db file must be kept.""" + db_path = os.path.join(self.metrics_dir, "gauge_livesum_99999.db") + with open(db_path, "w") as f: + f.write("fake") + + settings = MetricsSettings(directory=self.metrics_dir, clean_dead_pids=True) + create_metrics_collector(settings) + + self.assertTrue( + os.path.exists(db_path), + "PermissionError should be treated as 'process alive' -- file must be kept", + ) + if __name__ == "__main__": unittest.main() From 08e1725326628f47f4a1b848db7e447fa15316d3 Mon Sep 17 00:00:00 2001 From: Chris Hagglund Date: Tue, 2 Jun 2026 08:42:16 -0600 Subject: [PATCH 22/24] determine metrics dir early --- .../client/automator/task_handler.py | 7 +-- .../settings/metrics_settings.py | 33 +++++++++-- .../telemetry/canonical_metrics_collector.py | 4 +- .../telemetry/legacy_metrics_collector.py | 4 +- .../telemetry/metrics_collector_base.py | 4 +- .../client/telemetry/metrics_factory.py | 37 ++---------- .../unit/api_client/test_async_api_client.py | 58 +++++++++++++++++++ tests/unit/telemetry/test_metrics_factory.py | 31 +++++----- 8 files changed, 114 insertions(+), 64 deletions(-) create mode 100644 tests/unit/api_client/test_async_api_client.py diff --git a/src/conductor/client/automator/task_handler.py b/src/conductor/client/automator/task_handler.py index 8dcf5ba09..27a0e8877 100644 --- a/src/conductor/client/automator/task_handler.py +++ b/src/conductor/client/automator/task_handler.py @@ -232,12 +232,9 @@ def __init__( self._configuration = configuration self._metrics_settings = metrics_settings - # Resolve the metrics subdirectory and set PROMETHEUS_MULTIPROC_DIR - # BEFORE any worker processes start. resolve_metrics_type is - # idempotent so it's safe if the caller already called it. + # Set PROMETHEUS_MULTIPROC_DIR BEFORE any worker processes start. + # MetricsSettings resolves the subdirectory eagerly at construction. if metrics_settings is not None: - from conductor.client.telemetry.metrics_factory import resolve_metrics_type - resolve_metrics_type(metrics_settings) os.environ["PROMETHEUS_MULTIPROC_DIR"] = metrics_settings.metrics_directory logger.info(f"Set PROMETHEUS_MULTIPROC_DIR={metrics_settings.metrics_directory}") diff --git a/src/conductor/client/configuration/settings/metrics_settings.py b/src/conductor/client/configuration/settings/metrics_settings.py index b2fd37954..cdab29c36 100644 --- a/src/conductor/client/configuration/settings/metrics_settings.py +++ b/src/conductor/client/configuration/settings/metrics_settings.py @@ -14,6 +14,16 @@ ) +CANONICAL_SUBDIR = "canonical" + + +def _env_bool(name: str, default: bool) -> bool: + value = os.environ.get(name, "") + if not value: + return default + return value.strip().lower() in ("true", "1", "yes") + + def get_default_temporary_folder() -> str: return f"{Path.home()!s}/tmp/" @@ -30,6 +40,11 @@ def __init__( """ Configure metrics collection settings. + The ``WORKER_CANONICAL_METRICS`` env var is read at construction time + to decide whether ``.db`` files go in *directory* (legacy) or in a + ``canonical/`` subdirectory (canonical mode). Set the env var before + creating this object. + Args: directory: Base directory for storing multiprocess metrics .db files. Legacy metrics use this directory directly (unchanged from @@ -62,16 +77,24 @@ def __init__( self.http_port = http_port self.clean_directory = clean_directory self.clean_dead_pids = clean_dead_pids - self._subdir: Optional[str] = None + self._subdir: str = ( + CANONICAL_SUBDIR + if _env_bool("WORKER_CANONICAL_METRICS", False) + else "" + ) + + @property + def is_canonical(self) -> bool: + return self._subdir == CANONICAL_SUBDIR @property def metrics_directory(self) -> str: """Full path where .db files live (base directory + optional subdir). - The subdirectory is set by ``metrics_factory.resolve_metrics_type`` - before any child processes are forked. Legacy leaves it empty (base - directory unchanged from prior releases); canonical sets it to - ``"canonical"`` to avoid stale metric-name collisions. + Legacy leaves the subdirectory empty (base directory unchanged from + prior releases); canonical sets it to ``"canonical"`` to avoid stale + metric-name collisions. Resolved eagerly at construction time from + the ``WORKER_CANONICAL_METRICS`` env var. """ if self._subdir: return os.path.join(self.directory, self._subdir) diff --git a/src/conductor/client/telemetry/canonical_metrics_collector.py b/src/conductor/client/telemetry/canonical_metrics_collector.py index fc6f2e8a4..e21c3fc94 100644 --- a/src/conductor/client/telemetry/canonical_metrics_collector.py +++ b/src/conductor/client/telemetry/canonical_metrics_collector.py @@ -11,6 +11,8 @@ WORKER_CANONICAL_METRICS=true. """ +from typing import Optional + from conductor.client.configuration.settings.metrics_settings import MetricsSettings from conductor.client.telemetry.metrics_collector_base import ( MetricsCollectorBase, @@ -193,7 +195,7 @@ def record_task_update_time(self, task_type: str, time_spent: float, status: str ) def record_api_request_time(self, method: str, uri: str, status: str, time_spent: float, - metric_uri: str = None) -> None: + metric_uri: Optional[str] = None) -> None: resolved_uri = (metric_uri or uri).replace("{tasktype}", "{taskType}") self._observe_histogram( name=MetricName.API_REQUEST_TIME_CANONICAL, diff --git a/src/conductor/client/telemetry/legacy_metrics_collector.py b/src/conductor/client/telemetry/legacy_metrics_collector.py index d70a86a58..88636272d 100644 --- a/src/conductor/client/telemetry/legacy_metrics_collector.py +++ b/src/conductor/client/telemetry/legacy_metrics_collector.py @@ -7,7 +7,7 @@ """ from collections import deque -from typing import Any, Dict, List +from typing import Any, Dict, List, Optional from conductor.client.configuration.settings.metrics_settings import MetricsSettings from conductor.client.telemetry import metrics_collector_base as _mcb @@ -176,7 +176,7 @@ def record_task_update_time(self, task_type: str, time_spent: float, status: str pass # canonical-only; call sites were added in this branch and didn't exist on main def record_api_request_time(self, method: str, uri: str, status: str, time_spent: float, - metric_uri: str = None) -> None: + metric_uri: Optional[str] = None) -> None: self._record_quantiles( name=MetricName.API_REQUEST_TIME, documentation=MetricDocumentation.API_REQUEST_TIME, diff --git a/src/conductor/client/telemetry/metrics_collector_base.py b/src/conductor/client/telemetry/metrics_collector_base.py index 50f91a8d8..d91f3fe6b 100644 --- a/src/conductor/client/telemetry/metrics_collector_base.py +++ b/src/conductor/client/telemetry/metrics_collector_base.py @@ -5,7 +5,7 @@ import signal import threading import time -from typing import Any, Dict +from typing import Any, Dict, Optional # Lazy imports - these will be imported when first needed. # PROMETHEUS_MULTIPROC_DIR must be set before prometheus_client is imported. @@ -419,7 +419,7 @@ def record_task_update_time(self, task_type: str, time_spent: float, status: str @abc.abstractmethod def record_api_request_time(self, method: str, uri: str, status: str, time_spent: float, - metric_uri: str = None) -> None: ... + metric_uri: Optional[str] = None) -> None: ... @abc.abstractmethod def record_task_result_payload_size(self, task_type: str, payload_size: int) -> None: ... diff --git a/src/conductor/client/telemetry/metrics_factory.py b/src/conductor/client/telemetry/metrics_factory.py index 8a2aed960..8523a2018 100644 --- a/src/conductor/client/telemetry/metrics_factory.py +++ b/src/conductor/client/telemetry/metrics_factory.py @@ -22,16 +22,6 @@ ) -_CANONICAL_SUBDIR = "canonical" - - -def _env_bool(name: str, default: bool) -> bool: - value = os.environ.get(name, "") - if not value: - return default - return value.strip().lower() in ("true", "1", "yes") - - _cleaned_directories: set = set() @@ -42,35 +32,16 @@ def _reset_cleaned_directories() -> None: _cleaned_directories.clear() -def resolve_metrics_type(settings: MetricsSettings) -> None: - """Read ``WORKER_CANONICAL_METRICS`` once and store the result on *settings*. - - Idempotent -- subsequent calls are no-ops. Must be called before - ``settings.metrics_directory`` is read so that the subdirectory is - resolved in the main process before any child processes are forked. - - Both ``TaskHandler.__init__`` and ``create_metrics_collector`` call this. - """ - if settings._subdir is not None: - return - if _env_bool("WORKER_CANONICAL_METRICS", default=False): - settings._subdir = _CANONICAL_SUBDIR - else: - settings._subdir = "" - - def create_metrics_collector(settings: MetricsSettings) -> MetricsCollectorBase: """ - Create the metrics collector selected by environment variables. + Create the metrics collector indicated by ``settings.is_canonical``. - Calls ``resolve_metrics_type`` to ensure ``settings.metrics_directory`` - returns the correct path, then instantiates the appropriate collector. + ``MetricsSettings`` reads ``WORKER_CANONICAL_METRICS`` at construction + time, so the directory and collector type are already determined. Returns a fully-initialised collector (legacy or canonical) that satisfies the MetricsCollector Protocol and can be registered as an event listener. """ - resolve_metrics_type(settings) - metrics_dir = settings.metrics_directory os.makedirs(metrics_dir, exist_ok=True) @@ -81,7 +52,7 @@ def create_metrics_collector(settings: MetricsSettings) -> MetricsCollectorBase: if settings.clean_dead_pids: settings._clean_dead_pid_files() - if settings._subdir == _CANONICAL_SUBDIR: + if settings.is_canonical: from conductor.client.telemetry.canonical_metrics_collector import CanonicalMetricsCollector logger.info("WORKER_CANONICAL_METRICS is true — using CanonicalMetricsCollector (dir=%s)", metrics_dir) return CanonicalMetricsCollector(settings) diff --git a/tests/unit/api_client/test_async_api_client.py b/tests/unit/api_client/test_async_api_client.py new file mode 100644 index 000000000..b75b59351 --- /dev/null +++ b/tests/unit/api_client/test_async_api_client.py @@ -0,0 +1,58 @@ +import asyncio +import unittest +from unittest.mock import Mock, AsyncMock + +from conductor.client.http.async_api_client import AsyncApiClient +from conductor.client.configuration.configuration import Configuration + + +class TestAsyncApiClientMetricUri(unittest.TestCase): + + def setUp(self): + self.config = Configuration( + base_url="http://localhost:8080", + authentication_settings=None, + ) + + def _run(self, coro): + loop = asyncio.new_event_loop() + try: + return loop.run_until_complete(coro) + finally: + loop.close() + + def test_request_passes_metric_uri_to_collector(self): + """metric_uri kwarg is forwarded to record_api_request_time on success.""" + metrics_collector = Mock() + client = AsyncApiClient(configuration=self.config, metrics_collector=metrics_collector) + client.async_rest_client.GET = AsyncMock(return_value=Mock(status=200)) + + self._run(client.request( + 'GET', 'http://localhost:8080/api/workflow/test-id', + metric_uri='/api/workflow/{workflowId}', + )) + + call_args = metrics_collector.record_api_request_time.call_args + self.assertEqual(call_args[1]['metric_uri'], '/api/workflow/{workflowId}') + + def test_request_passes_metric_uri_to_collector_on_error(self): + """metric_uri kwarg is forwarded to record_api_request_time on error.""" + metrics_collector = Mock() + client = AsyncApiClient(configuration=self.config, metrics_collector=metrics_collector) + + error = Exception('Test error') + error.status = 500 + client.async_rest_client.GET = AsyncMock(side_effect=error) + + with self.assertRaises(Exception): + self._run(client.request( + 'GET', 'http://localhost:8080/api/workflow/test-id', + metric_uri='/api/workflow/{workflowId}', + )) + + call_args = metrics_collector.record_api_request_time.call_args + self.assertEqual(call_args[1]['metric_uri'], '/api/workflow/{workflowId}') + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/unit/telemetry/test_metrics_factory.py b/tests/unit/telemetry/test_metrics_factory.py index 7fc77b5cf..1a324245b 100644 --- a/tests/unit/telemetry/test_metrics_factory.py +++ b/tests/unit/telemetry/test_metrics_factory.py @@ -21,11 +21,10 @@ class TestMetricsFactory(unittest.TestCase): def setUp(self): _reset_cleaned_directories() - self.metrics_dir = tempfile.mkdtemp() - self.settings = MetricsSettings(directory=self.metrics_dir) self._saved_env = {} for key in ("WORKER_CANONICAL_METRICS", "WORKER_LEGACY_METRICS"): self._saved_env[key] = os.environ.pop(key, None) + self.metrics_dir = tempfile.mkdtemp() def tearDown(self): for key, val in self._saved_env.items(): @@ -37,62 +36,62 @@ def tearDown(self): shutil.rmtree(self.metrics_dir) _reset_cleaned_directories() + def _make_settings(self, **kwargs): + kwargs.setdefault("directory", self.metrics_dir) + return MetricsSettings(**kwargs) + def test_default_returns_legacy(self): """With no env vars set, factory returns LegacyMetricsCollector.""" - collector = create_metrics_collector(self.settings) + collector = create_metrics_collector(self._make_settings()) self.assertIsInstance(collector, LegacyMetricsCollector) def test_canonical_true_returns_canonical(self): """WORKER_CANONICAL_METRICS=true selects CanonicalMetricsCollector.""" os.environ["WORKER_CANONICAL_METRICS"] = "true" - collector = create_metrics_collector(self.settings) + collector = create_metrics_collector(self._make_settings()) self.assertIsInstance(collector, CanonicalMetricsCollector) def test_canonical_1_returns_canonical(self): """WORKER_CANONICAL_METRICS=1 selects CanonicalMetricsCollector.""" os.environ["WORKER_CANONICAL_METRICS"] = "1" - collector = create_metrics_collector(self.settings) + collector = create_metrics_collector(self._make_settings()) self.assertIsInstance(collector, CanonicalMetricsCollector) def test_canonical_yes_returns_canonical(self): """WORKER_CANONICAL_METRICS=yes selects CanonicalMetricsCollector.""" os.environ["WORKER_CANONICAL_METRICS"] = "yes" - collector = create_metrics_collector(self.settings) + collector = create_metrics_collector(self._make_settings()) self.assertIsInstance(collector, CanonicalMetricsCollector) def test_canonical_false_returns_legacy(self): """WORKER_CANONICAL_METRICS=false selects LegacyMetricsCollector.""" os.environ["WORKER_CANONICAL_METRICS"] = "false" - collector = create_metrics_collector(self.settings) + collector = create_metrics_collector(self._make_settings()) self.assertIsInstance(collector, LegacyMetricsCollector) def test_canonical_takes_priority_over_legacy(self): """WORKER_CANONICAL_METRICS=true wins even if WORKER_LEGACY_METRICS=true.""" os.environ["WORKER_CANONICAL_METRICS"] = "true" os.environ["WORKER_LEGACY_METRICS"] = "true" - collector = create_metrics_collector(self.settings) + collector = create_metrics_collector(self._make_settings()) self.assertIsInstance(collector, CanonicalMetricsCollector) def test_legacy_collector_name(self): """LegacyMetricsCollector.collector_name() returns 'legacy'.""" - collector = create_metrics_collector(self.settings) + collector = create_metrics_collector(self._make_settings()) self.assertEqual(collector.collector_name(), "legacy") def test_canonical_collector_name(self): """CanonicalMetricsCollector.collector_name() returns 'canonical'.""" os.environ["WORKER_CANONICAL_METRICS"] = "true" - collector = create_metrics_collector( - MetricsSettings(directory=tempfile.mkdtemp()) - ) + collector = create_metrics_collector(self._make_settings()) self.assertEqual(collector.collector_name(), "canonical") def test_both_implementations_satisfy_same_interface(self): """Both implementations have the same public method surface.""" - legacy = LegacyMetricsCollector(self.settings) + legacy = LegacyMetricsCollector(self._make_settings()) os.environ["WORKER_CANONICAL_METRICS"] = "true" - canonical = create_metrics_collector( - MetricsSettings(directory=tempfile.mkdtemp()) - ) + canonical = create_metrics_collector(self._make_settings()) required_methods = [ "collector_name", From 290911e84af18a6dc5376e2623374305fcbd81bb Mon Sep 17 00:00:00 2001 From: Chris Hagglund Date: Tue, 2 Jun 2026 09:02:08 -0600 Subject: [PATCH 23/24] adjust some help labels that were piggybacking on legacy for canonical --- src/conductor/client/telemetry/canonical_metrics_collector.py | 4 ++-- src/conductor/client/telemetry/model/metric_documentation.py | 4 +++- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/src/conductor/client/telemetry/canonical_metrics_collector.py b/src/conductor/client/telemetry/canonical_metrics_collector.py index e21c3fc94..187875655 100644 --- a/src/conductor/client/telemetry/canonical_metrics_collector.py +++ b/src/conductor/client/telemetry/canonical_metrics_collector.py @@ -170,7 +170,7 @@ def set_active_workers(self, task_type: str, count: int) -> None: def record_task_poll_time(self, task_type: str, time_spent: float, status: str = "SUCCESS") -> None: self._observe_histogram( name=MetricName.TASK_POLL_TIME_HISTOGRAM, - documentation=MetricDocumentation.TASK_POLL_TIME_HISTOGRAM, + documentation=MetricDocumentation.TASK_POLL_TIME_HISTOGRAM_CANONICAL, labels={MetricLabel.TASK_TYPE: task_type, MetricLabel.STATUS: status}, value=time_spent, buckets=TIME_BUCKETS, @@ -179,7 +179,7 @@ def record_task_poll_time(self, task_type: str, time_spent: float, status: str = def record_task_execute_time(self, task_type: str, time_spent: float, status: str = "SUCCESS") -> None: self._observe_histogram( name=MetricName.TASK_EXECUTE_TIME_HISTOGRAM, - documentation=MetricDocumentation.TASK_EXECUTE_TIME_HISTOGRAM, + documentation=MetricDocumentation.TASK_EXECUTE_TIME_HISTOGRAM_CANONICAL, labels={MetricLabel.TASK_TYPE: task_type, MetricLabel.STATUS: status}, value=time_spent, buckets=TIME_BUCKETS, diff --git a/src/conductor/client/telemetry/model/metric_documentation.py b/src/conductor/client/telemetry/model/metric_documentation.py index 47d8c2736..89af6bac9 100644 --- a/src/conductor/client/telemetry/model/metric_documentation.py +++ b/src/conductor/client/telemetry/model/metric_documentation.py @@ -17,7 +17,7 @@ class MetricDocumentation(str, Enum): TASK_POLL_TIME_HISTOGRAM = "Task poll duration in seconds with quantiles" TASK_RESULT_SIZE = "Records output payload size of a task" TASK_UPDATE_ERROR = "Task status cannot be updated back to server" - TASK_UPDATE_TIME_HISTOGRAM = "Task update duration in seconds with quantiles" + TASK_UPDATE_TIME_HISTOGRAM = "Latency of task result updates in seconds" THREAD_UNCAUGHT_EXCEPTION = "thread_uncaught_exceptions" WORKER_RESTART = "Worker subprocess restarted" WORKFLOW_START_ERROR = "Counter for workflow start errors" @@ -25,6 +25,8 @@ class MetricDocumentation(str, Enum): # Canonical-only documentation strings API_REQUEST_TIME_CANONICAL = "Latency of HTTP API client requests in seconds" + TASK_POLL_TIME_HISTOGRAM_CANONICAL = "Latency of task poll requests in seconds" + TASK_EXECUTE_TIME_HISTOGRAM_CANONICAL = "Duration of worker task execution in seconds" TASK_EXECUTION_STARTED = "Incremented when a polled task is dispatched to the worker function" TASK_POLL_ERROR = "Incremented when a poll request fails client-side" TASK_RESULT_SIZE_BYTES = "Serialized byte size of task result output" From 372ac78b34474527375c70c378e18aa99385029b Mon Sep 17 00:00:00 2001 From: Chris Hagglund Date: Tue, 2 Jun 2026 09:09:40 -0600 Subject: [PATCH 24/24] group constants better --- src/conductor/client/telemetry/model/metric_documentation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/conductor/client/telemetry/model/metric_documentation.py b/src/conductor/client/telemetry/model/metric_documentation.py index 89af6bac9..7f2fa4ec4 100644 --- a/src/conductor/client/telemetry/model/metric_documentation.py +++ b/src/conductor/client/telemetry/model/metric_documentation.py @@ -17,7 +17,6 @@ class MetricDocumentation(str, Enum): TASK_POLL_TIME_HISTOGRAM = "Task poll duration in seconds with quantiles" TASK_RESULT_SIZE = "Records output payload size of a task" TASK_UPDATE_ERROR = "Task status cannot be updated back to server" - TASK_UPDATE_TIME_HISTOGRAM = "Latency of task result updates in seconds" THREAD_UNCAUGHT_EXCEPTION = "thread_uncaught_exceptions" WORKER_RESTART = "Worker subprocess restarted" WORKFLOW_START_ERROR = "Counter for workflow start errors" @@ -27,6 +26,7 @@ class MetricDocumentation(str, Enum): API_REQUEST_TIME_CANONICAL = "Latency of HTTP API client requests in seconds" TASK_POLL_TIME_HISTOGRAM_CANONICAL = "Latency of task poll requests in seconds" TASK_EXECUTE_TIME_HISTOGRAM_CANONICAL = "Duration of worker task execution in seconds" + TASK_UPDATE_TIME_HISTOGRAM = "Latency of task result updates in seconds" TASK_EXECUTION_STARTED = "Incremented when a polled task is dispatched to the worker function" TASK_POLL_ERROR = "Incremented when a poll request fails client-side" TASK_RESULT_SIZE_BYTES = "Serialized byte size of task result output"