From 34dc0a9c22ca916834384954b45b1c8c0922d38f Mon Sep 17 00:00:00 2001 From: Morgan Wowk Date: Tue, 17 Mar 2026 21:20:35 -0700 Subject: [PATCH] feat: Measure execution status duration **Changes:** * Adds histogram measurement for execution node status duration without adding additional database load to the system --- cloud_pipelines_backend/backend_types_sql.py | 3 + .../instrumentation/metrics.py | 19 ++++- .../sql_event_listeners.py | 76 +++++++++++++++++++ start_local.py | 1 + tests/test_sql_event_listeners.py | 49 ++++++++++++ 5 files changed, 147 insertions(+), 1 deletion(-) create mode 100644 cloud_pipelines_backend/sql_event_listeners.py create mode 100644 tests/test_sql_event_listeners.py diff --git a/cloud_pipelines_backend/backend_types_sql.py b/cloud_pipelines_backend/backend_types_sql.py index b984c6c..636da28 100644 --- a/cloud_pipelines_backend/backend_types_sql.py +++ b/cloud_pipelines_backend/backend_types_sql.py @@ -424,7 +424,10 @@ class ExecutionNode(_TableBase): repr=False, ) + _status_changed: bool = dataclasses.field(default=False, init=False, repr=False) + +EXECUTION_NODE_EXTRA_DATA_STATUS_HISTORY_KEY = "container_execution_status_history" EXECUTION_NODE_EXTRA_DATA_SYSTEM_ERROR_EXCEPTION_MESSAGE_KEY = ( "system_error_exception_message" ) diff --git a/cloud_pipelines_backend/instrumentation/metrics.py b/cloud_pipelines_backend/instrumentation/metrics.py index 5ddb326..71b79ef 100644 --- a/cloud_pipelines_backend/instrumentation/metrics.py +++ b/cloud_pipelines_backend/instrumentation/metrics.py @@ -22,8 +22,18 @@ - Instrument: orchestrator_execution_system_errors """ +import enum + from opentelemetry import metrics as otel_metrics + +class MetricUnit(str, enum.Enum): + """UCUM-style unit strings accepted by the OTel SDK.""" + + SECONDS = "s" + ERRORS = "{error}" + + # --------------------------------------------------------------------------- # tangle.orchestrator # --------------------------------------------------------------------------- @@ 
-32,5 +42,12 @@ execution_system_errors = orchestrator_meter.create_counter( name="execution.system_errors", description="Number of execution nodes that ended in SYSTEM_ERROR status", - unit="{error}", + unit=MetricUnit.ERRORS, +) + +execution_status_transition_duration = orchestrator_meter.create_histogram( + name="execution.status_transition.duration", + description="Duration an execution spent in a status before transitioning to the next status", + unit=MetricUnit.SECONDS, ) + diff --git a/cloud_pipelines_backend/sql_event_listeners.py b/cloud_pipelines_backend/sql_event_listeners.py new file mode 100644 index 0000000..e48c855 --- /dev/null +++ b/cloud_pipelines_backend/sql_event_listeners.py @@ -0,0 +1,76 @@ +"""SQLAlchemy event listeners for cloud_pipelines_backend models. + +This module registers global SQLAlchemy event hooks. It must be imported at +application startup (start_local.py, orchestrator_main_oasis.py, etc.) for the +listeners to take effect. +""" + +import datetime +import logging +import typing + +from sqlalchemy import event as sql_event +from sqlalchemy import orm + +from . 
import backend_types_sql
+from .instrumentation import metrics
+
+_logger = logging.getLogger(__name__)
+
+
+@sql_event.listens_for(backend_types_sql.ExecutionNode.container_execution_status, "set")
+def _handle_container_execution_status_set(
+    execution: backend_types_sql.ExecutionNode,
+    value: typing.Any,
+    _old_value: typing.Any,
+    _initiator: typing.Any,
+) -> None:
+    if value is None:
+        return
+    if execution.extra_data is None:
+        execution.extra_data = {}
+    history: list = execution.extra_data.get(
+        backend_types_sql.EXECUTION_NODE_EXTRA_DATA_STATUS_HISTORY_KEY, []
+    )
+    entry = {
+        "status": value.value,
+        "first_observed_at": datetime.datetime.now(
+            datetime.timezone.utc
+        ).isoformat(),
+    }
+    execution.extra_data = {
+        **execution.extra_data,
+        backend_types_sql.EXECUTION_NODE_EXTRA_DATA_STATUS_HISTORY_KEY: history + [entry],
+    }
+    execution._status_changed = True
+
+
+@sql_event.listens_for(orm.Session, "before_commit")
+def _handle_before_commit(session: orm.Session) -> None:
+    for obj in list(session.new) + list(session.dirty):
+        if not isinstance(obj, backend_types_sql.ExecutionNode):
+            continue
+        if not obj._status_changed:
+            continue
+        history: list = (obj.extra_data or {}).get(
+            backend_types_sql.EXECUTION_NODE_EXTRA_DATA_STATUS_HISTORY_KEY, []
+        )
+        if len(history) >= 2:
+            prev = history[-2]
+            curr = history[-1]
+            try:
+                prev_time = datetime.datetime.fromisoformat(prev["first_observed_at"])
+                curr_time = datetime.datetime.fromisoformat(curr["first_observed_at"])
+                metrics.execution_status_transition_duration.record(
+                    (curr_time - prev_time).total_seconds(),
+                    attributes={
+                        "execution.status.from": prev["status"],
+                        "execution.status.to": curr["status"],
+                    },
+                )
+            except Exception:
+                _logger.warning(
+                    f"Failed to record status transition metric for execution {obj.id!r}",
+                    exc_info=True,
+                )
+        obj._status_changed = False
diff --git a/start_local.py b/start_local.py
index 4ea8921..bddf735 100644
--- a/start_local.py
+++ 
b/start_local.py @@ -159,6 +159,7 @@ def get_user_details(request: fastapi.Request): interfaces as storage_interfaces, ) from cloud_pipelines_backend import orchestrator_sql +from cloud_pipelines_backend import sql_event_listeners def run_orchestrator( diff --git a/tests/test_sql_event_listeners.py b/tests/test_sql_event_listeners.py new file mode 100644 index 0000000..ddde872 --- /dev/null +++ b/tests/test_sql_event_listeners.py @@ -0,0 +1,49 @@ +"""Tests for cloud_pipelines_backend.sql_event_listeners.""" + +import unittest.mock + +import pytest +from sqlalchemy import orm + +from cloud_pipelines_backend import backend_types_sql as bts +from cloud_pipelines_backend import database_ops +from cloud_pipelines_backend import sql_event_listeners # noqa: F401 — registers listeners +from cloud_pipelines_backend.instrumentation import metrics + + +@pytest.fixture() +def session() -> orm.Session: + db_engine = database_ops.create_db_engine(database_uri="sqlite://") + bts._TableBase.metadata.create_all(db_engine) + with orm.Session(db_engine) as s: + yield s + + +class TestStatusHistoryListeners: + def test_status_change_appends_history_to_extra_data(self, session: orm.Session) -> None: + node = bts.ExecutionNode(task_spec={}) + session.add(node) + node.container_execution_status = bts.ContainerExecutionStatus.QUEUED + session.commit() + + history = node.extra_data[bts.EXECUTION_NODE_EXTRA_DATA_STATUS_HISTORY_KEY] + assert len(history) == 1 + assert history[0]["status"] == bts.ContainerExecutionStatus.QUEUED + + def test_second_status_change_records_duration_metric(self, session: orm.Session) -> None: + node = bts.ExecutionNode(task_spec={}) + session.add(node) + node.container_execution_status = bts.ContainerExecutionStatus.QUEUED + session.commit() + + node.container_execution_status = bts.ContainerExecutionStatus.RUNNING + with unittest.mock.patch.object( + metrics.execution_status_transition_duration, "record" + ) as mock_record: + session.commit() + + 
mock_record.assert_called_once() + assert mock_record.call_args.kwargs["attributes"] == { + "execution.status.from": bts.ContainerExecutionStatus.QUEUED, + "execution.status.to": bts.ContainerExecutionStatus.RUNNING, + }