From 46669a3a37c238900daefb6aaa9fff05f3497eb0 Mon Sep 17 00:00:00 2001
From: Major Hayden
Date: Tue, 14 Apr 2026 08:05:16 -0500
Subject: [PATCH] fix: plumb LLM token counts into rlsapi Splunk telemetry

total_llm_tokens was hardcoded to 0 in the rlsapi Splunk event builder
despite token counting being implemented via extract_token_usage(). Add
input_tokens and output_tokens to InferenceEventData and pass actual
counts from the endpoint handler.

Ref: RSPEED-2857

Signed-off-by: Major Hayden
---
 src/app/endpoints/rlsapi_v1.py                 | 20 +++++++++++++-
 src/observability/formats/rlsapi.py            |  5 ++--
 .../endpoints/test_rlsapi_v1_integration.py    | 20 ++++++++++++++
 tests/unit/app/endpoints/test_rlsapi_v1.py     |  8 ++++++
 .../unit/observability/formats/test_rlsapi.py  | 27 +++++++++++++++++++
 5 files changed, 77 insertions(+), 3 deletions(-)

diff --git a/src/app/endpoints/rlsapi_v1.py b/src/app/endpoints/rlsapi_v1.py
index cb6d36588..360bedbf1 100644
--- a/src/app/endpoints/rlsapi_v1.py
+++ b/src/app/endpoints/rlsapi_v1.py
@@ -353,8 +353,22 @@ def _queue_splunk_event(  # pylint: disable=too-many-arguments,too-many-position
     response_text: str,
     inference_time: float,
     sourcetype: str,
+    input_tokens: int = 0,
+    output_tokens: int = 0,
 ) -> None:
-    """Build and queue a Splunk telemetry event for background sending."""
+    """Build and queue a Splunk telemetry event for background sending.
+
+    Args:
+        background_tasks: FastAPI background task manager.
+        infer_request: Original rlsapi v1 inference request.
+        request: FastAPI request object used to resolve identity context.
+        request_id: Unique identifier for the request.
+        response_text: Response text to include in the telemetry event.
+        inference_time: Request processing duration in seconds.
+        sourcetype: Splunk sourcetype to use when sending the event.
+        input_tokens: Number of prompt tokens consumed by the LLM call.
+        output_tokens: Number of completion tokens produced by the LLM call.
+ """ org_id, system_id = _get_rh_identity_context(request) systeminfo = infer_request.context.systeminfo @@ -370,6 +384,8 @@ def _queue_splunk_event( # pylint: disable=too-many-arguments,too-many-position system_os=systeminfo.os, system_version=systeminfo.version, system_arch=systeminfo.arch, + input_tokens=input_tokens, + output_tokens=output_tokens, ) event = build_inference_event(event_data) @@ -754,6 +770,8 @@ async def infer_endpoint( # pylint: disable=R0914 response_text, inference_time, "infer_with_llm", + input_tokens=token_usage.input_tokens, + output_tokens=token_usage.output_tokens, ) logger.info("Completed rlsapi v1 /infer request %s", request_id) diff --git a/src/observability/formats/rlsapi.py b/src/observability/formats/rlsapi.py index 1bc110805..67f7065f8 100644 --- a/src/observability/formats/rlsapi.py +++ b/src/observability/formats/rlsapi.py @@ -26,6 +26,8 @@ class InferenceEventData: # pylint: disable=too-many-instance-attributes system_os: str system_version: str system_arch: str + input_tokens: int = 0 + output_tokens: int = 0 def build_inference_event(data: InferenceEventData) -> dict[str, Any]: @@ -47,8 +49,7 @@ def build_inference_event(data: InferenceEventData) -> dict[str, Any]: "deployment": configuration.deployment_environment, "org_id": data.org_id, "system_id": data.system_id, - # Token counting not yet implemented in lightspeed-stack; rlsapi uses 0 as default - "total_llm_tokens": 0, + "total_llm_tokens": data.input_tokens + data.output_tokens, "request_id": data.request_id, "cla_version": data.cla_version, "system_os": data.system_os, diff --git a/tests/integration/endpoints/test_rlsapi_v1_integration.py b/tests/integration/endpoints/test_rlsapi_v1_integration.py index bfcccb464..db9fdfe77 100644 --- a/tests/integration/endpoints/test_rlsapi_v1_integration.py +++ b/tests/integration/endpoints/test_rlsapi_v1_integration.py @@ -115,6 +115,10 @@ def _setup_responses_mock( """Set up responses.create mock with the given response text.""" mock_response = mocker.Mock() mock_response.output = [_create_mock_response_output(mocker, response_text)] + mock_usage = mocker.Mock() + mock_usage.input_tokens = 10 + mock_usage.output_tokens = 5 + mock_response.usage = mock_usage mock_responses = mocker.Mock() mock_responses.create = mocker.AsyncMock(return_value=mock_response) @@ -303,6 +307,10 @@ async def test_rlsapi_v1_infer_fallback_response_empty_output( mock_response = mocker.Mock() mock_response.output = [] + mock_usage = mocker.Mock() + mock_usage.input_tokens = 10 + mock_usage.output_tokens = 5 + mock_response.usage = mock_usage mock_responses = mocker.Mock() mock_responses.create = mocker.AsyncMock(return_value=mock_response) @@ -342,6 +350,10 @@ async def test_rlsapi_v1_infer_input_source_combination( mock_response = mocker.Mock() mock_response.output = [_create_mock_response_output(mocker, "response text")] + mock_usage = mocker.Mock() + mock_usage.input_tokens = 10 + mock_usage.output_tokens = 5 + mock_response.usage = mock_usage mock_responses = mocker.Mock() mock_responses.create = mocker.AsyncMock(return_value=mock_response) @@ -401,6 +413,10 @@ async def test_rlsapi_v1_infer_no_mcp_servers_passes_empty_tools( mock_response = mocker.Mock() mock_response.output = [_create_mock_response_output(mocker, "response text")] + mock_usage = mocker.Mock() + mock_usage.input_tokens = 10 + mock_usage.output_tokens = 5 + mock_response.usage = mock_usage mock_responses = mocker.Mock() mock_responses.create = mocker.AsyncMock(return_value=mock_response) @@ -442,6 +458,10 

     mock_response = mocker.Mock()
     mock_response.output = [_create_mock_response_output(mocker, "enriched response")]
+    mock_usage = mocker.Mock()
+    mock_usage.input_tokens = 10
+    mock_usage.output_tokens = 5
+    mock_response.usage = mock_usage

     mock_responses = mocker.Mock()
     mock_responses.create = mocker.AsyncMock(return_value=mock_response)
diff --git a/tests/unit/app/endpoints/test_rlsapi_v1.py b/tests/unit/app/endpoints/test_rlsapi_v1.py
index edf5a346f..213e3ed20 100644
--- a/tests/unit/app/endpoints/test_rlsapi_v1.py
+++ b/tests/unit/app/endpoints/test_rlsapi_v1.py
@@ -116,6 +116,10 @@ def mock_llm_response_fixture(mocker: MockerFixture) -> None:
     mock_response.output = [
         _create_mock_response_output(mocker, "This is a test LLM response.")
     ]
+    mock_usage = mocker.Mock()
+    mock_usage.input_tokens = 10
+    mock_usage.output_tokens = 5
+    mock_response.usage = mock_usage

     _setup_responses_mock(mocker, mocker.AsyncMock(return_value=mock_response))

@@ -124,6 +128,10 @@ def mock_empty_llm_response_fixture(mocker: MockerFixture) -> None:
     """Mock responses.create to return empty output list."""
     mock_response = mocker.Mock()
     mock_response.output = []
+    mock_usage = mocker.Mock()
+    mock_usage.input_tokens = 10
+    mock_usage.output_tokens = 5
+    mock_response.usage = mock_usage

     _setup_responses_mock(mocker, mocker.AsyncMock(return_value=mock_response))
diff --git a/tests/unit/observability/formats/test_rlsapi.py b/tests/unit/observability/formats/test_rlsapi.py
index 4ea8624ac..e6e997d73 100644
--- a/tests/unit/observability/formats/test_rlsapi.py
+++ b/tests/unit/observability/formats/test_rlsapi.py
@@ -50,6 +50,33 @@ def test_builds_event_with_all_fields(
     assert event["total_llm_tokens"] == 0


+def test_builds_event_with_token_counts(mocker: MockerFixture) -> None:
+    """Test total_llm_tokens is computed from input and output token counts."""
+    mocker.patch(
+        "observability.formats.rlsapi.configuration"
+    ).deployment_environment = "production"
+
+    data = InferenceEventData(
+        question="test",
+        response="test",
+        inference_time=1.0,
+        model="test-model",
+        org_id="org1",
+        system_id="sys1",
+        request_id="req_1",
+        cla_version="CLA/1.0",
+        system_os="RHEL",
+        system_version="9.4",
+        system_arch="x86_64",
+        input_tokens=150,
+        output_tokens=75,
+    )
+
+    event = build_inference_event(data)
+
+    assert event["total_llm_tokens"] == 225
+
+
 def test_handles_auth_disabled_values(mocker: MockerFixture) -> None:
     """Test event handles auth_disabled placeholder values."""
     data = InferenceEventData(
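
The extract_token_usage() helper named in the commit message is not part of
this patch. A minimal sketch of the shape the endpoint handler appears to
rely on, assuming only the usage.input_tokens / usage.output_tokens
attributes that the test mocks set; the TokenUsage container and the
getattr-based fallbacks below are illustrative assumptions, not the actual
lightspeed-stack implementation:

    from dataclasses import dataclass
    from typing import Any


    @dataclass
    class TokenUsage:
        """Prompt and completion token counts for one LLM call."""

        input_tokens: int = 0
        output_tokens: int = 0


    def extract_token_usage(response: Any) -> TokenUsage:
        """Read token counts from a Responses API result, defaulting to 0."""
        # Hypothetical sketch: mirrors the mocked usage object in the tests
        # above, falling back to zero when usage or a count is missing.
        usage = getattr(response, "usage", None)
        if usage is None:
            return TokenUsage()
        return TokenUsage(
            input_tokens=getattr(usage, "input_tokens", 0) or 0,
            output_tokens=getattr(usage, "output_tokens", 0) or 0,
        )

With a helper of that shape, the handler can pass token_usage.input_tokens
and token_usage.output_tokens into _queue_splunk_event(), and
build_inference_event() reports their sum as total_llm_tokens (10 + 5 = 15
for the mocked values above, 150 + 75 = 225 in the new unit test).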