-
Notifications
You must be signed in to change notification settings - Fork 16
feat: Measure execution status duration #174
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,76 @@ | ||
| """SQLAlchemy event listeners for cloud_pipelines_backend models. | ||
|
|
||
| This module registers global SQLAlchemy event hooks. It must be imported at | ||
| application startup (start_local.py, orchestrator_main_oasis.py, etc.) for the | ||
| listeners to take effect. | ||
| """ | ||
|
|
||
| import datetime | ||
| import logging | ||
| import typing | ||
|
|
||
| from sqlalchemy import event as sql_event | ||
| from sqlalchemy import orm | ||
|
|
||
| from . import backend_types_sql | ||
| from .instrumentation import metrics | ||
|
|
||
| _logger = logging.getLogger(__name__) | ||
|
|
||
|
|
||
| @sql_event.listens_for(backend_types_sql.ExecutionNode.container_execution_status, "set") | ||
| def _handle_container_execution_status_set( | ||
| execution: backend_types_sql.ExecutionNode, | ||
| value: typing.Any, | ||
| _old_value: typing.Any, | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Do we need to check whether the new value is different from the old value? |
||
| _initiator: typing.Any, | ||
| ) -> None: | ||
| if value is None: | ||
| return | ||
| if execution.extra_data is None: | ||
| execution.extra_data = {} | ||
| history: list = execution.extra_data.get( | ||
| backend_types_sql.EXECUTION_NODE_EXTRA_DATA_STATUS_HISTORY_KEY, [] | ||
| ) | ||
| entry = { | ||
| "status": value.value, | ||
| "first_observed_at": datetime.datetime.now(datetime.timezone.utc).strftime( | ||
| "%Y-%m-%dT%H:%M:%SZ" | ||
| ), | ||
| } | ||
| execution.extra_data = { | ||
| **execution.extra_data, | ||
| backend_types_sql.EXECUTION_NODE_EXTRA_DATA_STATUS_HISTORY_KEY: history + [entry], | ||
|
Comment on lines
+21
to
+43
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Let's move this whole function to the orchestrator module. I think maintaining the status history becomes part of its job, so it belongs there. I'm kind of on the fence here, but modifications to ExecutionNode are probably the jurisdiction of the orchestrator. And |
||
| } | ||
| execution._status_changed = True | ||
|
|
||
|
|
||
| @sql_event.listens_for(orm.Session, "before_commit") | ||
| def _handle_before_commit(session: orm.Session) -> None: | ||
| for obj in list(session.new) + list(session.dirty): | ||
| if not isinstance(obj, backend_types_sql.ExecutionNode): | ||
| continue | ||
| if not obj._status_changed: | ||
| continue | ||
| history: list = (obj.extra_data or {}).get( | ||
| backend_types_sql.EXECUTION_NODE_EXTRA_DATA_STATUS_HISTORY_KEY, [] | ||
| ) | ||
| if len(history) >= 2: | ||
| prev = history[-2] | ||
| curr = history[-1] | ||
| prev_time = datetime.datetime.fromisoformat(prev["first_observed_at"]) | ||
| curr_time = datetime.datetime.fromisoformat(curr["first_observed_at"]) | ||
| try: | ||
| metrics.execution_status_transition_duration.record( | ||
|
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. OTel is built to never throw a runtime exception, but in case it ever did, I don't want that to result in a rollback of the commit — so I've wrapped this with a try/except. |
||
| (curr_time - prev_time).total_seconds(), | ||
| attributes={ | ||
| "execution.status.from": prev["status"], | ||
| "execution.status.to": curr["status"], | ||
| }, | ||
| ) | ||
| except Exception: | ||
| _logger.warning( | ||
| f"Failed to record status transition metric for execution {obj.id!r}", | ||
| exc_info=True, | ||
| ) | ||
| obj._status_changed = False | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,49 @@ | ||
| """Tests for cloud_pipelines_backend.sql_event_listeners.""" | ||
|
|
||
| import unittest.mock | ||
|
|
||
| import pytest | ||
| from sqlalchemy import orm | ||
|
|
||
| from cloud_pipelines_backend import backend_types_sql as bts | ||
| from cloud_pipelines_backend import database_ops | ||
| from cloud_pipelines_backend import sql_event_listeners # noqa: F401 — registers listeners | ||
| from cloud_pipelines_backend.instrumentation import metrics | ||
|
|
||
|
|
||
| @pytest.fixture() | ||
| def session() -> orm.Session: | ||
| db_engine = database_ops.create_db_engine(database_uri="sqlite://") | ||
| bts._TableBase.metadata.create_all(db_engine) | ||
| with orm.Session(db_engine) as s: | ||
| yield s | ||
|
|
||
|
|
||
| class TestStatusHistoryListeners: | ||
| def test_status_change_appends_history_to_extra_data(self, session: orm.Session) -> None: | ||
| node = bts.ExecutionNode(task_spec={}) | ||
| session.add(node) | ||
| node.container_execution_status = bts.ContainerExecutionStatus.QUEUED | ||
| session.commit() | ||
|
|
||
| history = node.extra_data[bts.EXECUTION_NODE_EXTRA_DATA_STATUS_HISTORY_KEY] | ||
| assert len(history) == 1 | ||
| assert history[0]["status"] == bts.ContainerExecutionStatus.QUEUED | ||
|
|
||
| def test_second_status_change_records_duration_metric(self, session: orm.Session) -> None: | ||
| node = bts.ExecutionNode(task_spec={}) | ||
| session.add(node) | ||
| node.container_execution_status = bts.ContainerExecutionStatus.QUEUED | ||
| session.commit() | ||
|
|
||
| node.container_execution_status = bts.ContainerExecutionStatus.RUNNING | ||
| with unittest.mock.patch.object( | ||
| metrics.execution_status_transition_duration, "record" | ||
| ) as mock_record: | ||
| session.commit() | ||
|
|
||
| mock_record.assert_called_once() | ||
| assert mock_record.call_args.kwargs["attributes"] == { | ||
| "execution.status.from": bts.ContainerExecutionStatus.QUEUED, | ||
| "execution.status.to": bts.ContainerExecutionStatus.RUNNING, | ||
| } |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Maybe let's add type for value?