From fe741140d555627d847a2a29e502d32e9327b9ad Mon Sep 17 00:00:00 2001 From: GareArc Date: Mon, 2 Mar 2026 04:14:57 -0800 Subject: [PATCH] fix(telemetry): fix zero-value message and workflow duration histograms Workflow RT: replace float(info.workflow_run_elapsed_time) with (end_time - start_time).total_seconds() using workflow_run.created_at and workflow_run.finished_at. The elapsed_time DB field defaults to 0 and can be stale if the workflow_storage Celery task has not committed yet when the trace fires. Wall-clock timestamps are more reliable; elapsed_time is kept as fallback. Message RT: change end_time from created_at + provider_response_latency to message.updated_at when updated_at > created_at. The pipeline explicitly sets message.updated_at = naive_utc_now() at the moment the LLM response is complete, making it the canonical response-complete timestamp. Falls back to the latency-based calculation for error/aborted messages. --- api/core/ops/ops_trace_manager.py | 2 +- api/enterprise/telemetry/enterprise_trace.py | 11 ++++++++++- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/api/core/ops/ops_trace_manager.py b/api/core/ops/ops_trace_manager.py index 0731b46eca..8465548bd5 100644 --- a/api/core/ops/ops_trace_manager.py +++ b/api/core/ops/ops_trace_manager.py @@ -904,7 +904,7 @@ class TraceTask: outputs=message_data.answer, file_list=file_list, start_time=created_at, - end_time=created_at + timedelta(seconds=message_data.provider_response_latency), + end_time=message_data.updated_at if message_data.updated_at and message_data.updated_at > created_at else created_at + timedelta(seconds=message_data.provider_response_latency), metadata=metadata, message_file_data=message_file_data, conversation_mode=conversation_mode, diff --git a/api/enterprise/telemetry/enterprise_trace.py b/api/enterprise/telemetry/enterprise_trace.py index c9eeccbcea..6f5754a7a3 100644 --- a/api/enterprise/telemetry/enterprise_trace.py +++ b/api/enterprise/telemetry/enterprise_trace.py @@ -256,9 +256,18 @@ class EnterpriseOtelTrace: invoke_from=invoke_from, ), ) + # Prefer wall-clock timestamps over the elapsed_time field: elapsed_time defaults + # to 0 in the DB and can be stale if the Celery write races with the trace task. + # start_time = workflow_run.created_at, end_time = workflow_run.finished_at. + if info.start_time and info.end_time: + workflow_duration = (info.end_time - info.start_time).total_seconds() + elif info.workflow_run_elapsed_time: + workflow_duration = float(info.workflow_run_elapsed_time) + else: + workflow_duration = 0.0 self._exporter.record_histogram( EnterpriseTelemetryHistogram.WORKFLOW_DURATION, - float(info.workflow_run_elapsed_time), + workflow_duration, self._labels( **labels, status=info.workflow_run_status,