diff --git a/api/commands.py b/api/commands.py index 5c13667efe..93ddc379fb 100644 --- a/api/commands.py +++ b/api/commands.py @@ -2630,6 +2630,8 @@ def clean_expired_messages( """ Clean expired messages and related data for tenants based on clean policy. """ + from extensions.otel.runtime import flush_telemetry + click.echo(click.style("clean_messages: start clean messages.", fg="green")) start_at = time.perf_counter() @@ -2671,6 +2673,8 @@ def clean_expired_messages( ) ) raise + finally: + flush_telemetry() click.echo(click.style("messages cleanup completed.", fg="green")) diff --git a/api/services/retention/conversation/messages_clean_service.py b/api/services/retention/conversation/messages_clean_service.py index f7836a2b14..86bf57c64e 100644 --- a/api/services/retention/conversation/messages_clean_service.py +++ b/api/services/retention/conversation/messages_clean_service.py @@ -1,16 +1,16 @@ import datetime import logging -import os import random import time from collections.abc import Sequence -from typing import cast +from typing import TYPE_CHECKING, cast import sqlalchemy as sa from sqlalchemy import delete, select, tuple_ from sqlalchemy.engine import CursorResult from sqlalchemy.orm import Session +from configs import dify_config from extensions.ext_database import db from models.model import ( App, @@ -32,6 +32,127 @@ from services.retention.conversation.messages_clean_policy import ( logger = logging.getLogger(__name__) +if TYPE_CHECKING: + from opentelemetry.metrics import Counter, Histogram + + +class MessagesCleanupMetrics: + """ + Records low-cardinality OpenTelemetry metrics for expired message cleanup jobs. + + We keep labels stable (dry_run/window_mode/status) so these metrics remain + dashboard-friendly for long-running CronJob executions. + """ + + _job_runs_total: "Counter | None" + _batches_total: "Counter | None" + _messages_scanned_total: "Counter | None" + _messages_filtered_total: "Counter | None" + _messages_deleted_total: "Counter | None" + _job_duration_seconds: "Histogram | None" + _batch_duration_seconds: "Histogram | None" + _base_attributes: dict[str, str] + + def __init__(self, *, dry_run: bool, has_window: bool) -> None: + self._job_runs_total = None + self._batches_total = None + self._messages_scanned_total = None + self._messages_filtered_total = None + self._messages_deleted_total = None + self._job_duration_seconds = None + self._batch_duration_seconds = None + self._base_attributes = { + "job_name": "messages_cleanup", + "dry_run": str(dry_run).lower(), + "window_mode": "between" if has_window else "before_cutoff", + } + self._init_instruments() + + def _init_instruments(self) -> None: + try: + from opentelemetry.metrics import get_meter + + meter = get_meter("messages_cleanup", version=dify_config.project.version) + self._job_runs_total = meter.create_counter( + "messages_cleanup_jobs_total", + description="Total number of expired message cleanup jobs by status.", + unit="{job}", + ) + self._batches_total = meter.create_counter( + "messages_cleanup_batches_total", + description="Total number of message cleanup batches processed.", + unit="{batch}", + ) + self._messages_scanned_total = meter.create_counter( + "messages_cleanup_scanned_messages_total", + description="Total messages scanned by cleanup jobs.", + unit="{message}", + ) + self._messages_filtered_total = meter.create_counter( + "messages_cleanup_filtered_messages_total", + description="Total messages selected by cleanup policy.", + unit="{message}", + ) + self._messages_deleted_total = meter.create_counter( + "messages_cleanup_deleted_messages_total", + description="Total messages deleted by cleanup jobs.", + unit="{message}", + ) + self._job_duration_seconds = meter.create_histogram( + "messages_cleanup_job_duration_seconds", + description="Duration of expired message cleanup jobs in seconds.", + unit="s", + ) + self._batch_duration_seconds = meter.create_histogram( + "messages_cleanup_batch_duration_seconds", + description="Duration of expired message cleanup batch processing in seconds.", + unit="s", + ) + except Exception: + logger.exception("messages_cleanup_metrics: failed to initialize instruments") + + def _attrs(self, **extra: str) -> dict[str, str]: + return {**self._base_attributes, **extra} + + @staticmethod + def _add(counter: "Counter | None", value: int, attributes: dict[str, str]) -> None: + if not counter or value <= 0: + return + try: + counter.add(value, attributes) + except Exception: + logger.exception("messages_cleanup_metrics: failed to add counter value") + + @staticmethod + def _record(histogram: "Histogram | None", value: float, attributes: dict[str, str]) -> None: + if not histogram: + return + try: + histogram.record(value, attributes) + except Exception: + logger.exception("messages_cleanup_metrics: failed to record histogram value") + + def record_batch( + self, + *, + scanned_messages: int, + filtered_messages: int, + deleted_messages: int, + batch_duration_seconds: float, + ) -> None: + attributes = self._attrs() + self._add(self._batches_total, 1, attributes) + self._add(self._messages_scanned_total, scanned_messages, attributes) + self._add(self._messages_filtered_total, filtered_messages, attributes) + self._add(self._messages_deleted_total, deleted_messages, attributes) + self._record(self._batch_duration_seconds, batch_duration_seconds, attributes) + + def record_completion(self, *, status: str, job_duration_seconds: float) -> None: + attributes = self._attrs(status=status) + self._add(self._job_runs_total, 1, attributes) + self._record(self._job_duration_seconds, job_duration_seconds, attributes) + + class MessagesCleanService: """ Service for cleaning expired messages based on retention policies. @@ -63,6 +184,7 @@ class MessagesCleanService: self._start_from = start_from self._batch_size = batch_size self._dry_run = dry_run + self._metrics = MessagesCleanupMetrics(dry_run=dry_run, has_window=bool(start_from)) @classmethod def from_time_range( @@ -161,7 +283,18 @@ class MessagesCleanService: Returns: Dict with statistics: batches, filtered_messages, total_deleted """ - return self._clean_messages_by_time_range() + status = "success" + run_start = time.monotonic() + try: + return self._clean_messages_by_time_range() + except Exception: + status = "failed" + raise + finally: + self._metrics.record_completion( + status=status, + job_duration_seconds=time.monotonic() - run_start, + ) def _clean_messages_by_time_range(self) -> dict[str, int]: """ @@ -196,11 +329,14 @@ class MessagesCleanService: self._end_before, ) - max_batch_interval_ms = int(os.environ.get("SANDBOX_EXPIRED_RECORDS_CLEAN_BATCH_MAX_INTERVAL", 200)) + max_batch_interval_ms = dify_config.SANDBOX_EXPIRED_RECORDS_CLEAN_BATCH_MAX_INTERVAL while True: stats["batches"] += 1 batch_start = time.monotonic() + batch_scanned_messages = 0 + batch_filtered_messages = 0 + batch_deleted_messages = 0 # Step 1: Fetch a batch of messages using cursor with Session(db.engine, expire_on_commit=False) as session: @@ -239,9 +375,16 @@ class MessagesCleanService: # Track total messages fetched across all batches stats["total_messages"] += len(messages) + batch_scanned_messages = len(messages) if not messages: logger.info("clean_messages (batch %s): no more messages to process", stats["batches"]) + self._metrics.record_batch( + scanned_messages=batch_scanned_messages, + filtered_messages=batch_filtered_messages, + deleted_messages=batch_deleted_messages, + batch_duration_seconds=time.monotonic() - batch_start, + ) break # Update cursor to the last message's (created_at, id) @@ -267,6 +410,12 @@ class MessagesCleanService: if not apps: logger.info("clean_messages (batch %s): no apps found, skip", stats["batches"]) + self._metrics.record_batch( + scanned_messages=batch_scanned_messages, + filtered_messages=batch_filtered_messages, + deleted_messages=batch_deleted_messages, + batch_duration_seconds=time.monotonic() - batch_start, + ) continue # Build app_id -> tenant_id mapping @@ -285,9 +434,16 @@ class MessagesCleanService: if not message_ids_to_delete: logger.info("clean_messages (batch %s): no messages to delete, skip", stats["batches"]) + self._metrics.record_batch( + scanned_messages=batch_scanned_messages, + filtered_messages=batch_filtered_messages, + deleted_messages=batch_deleted_messages, + batch_duration_seconds=time.monotonic() - batch_start, + ) continue stats["filtered_messages"] += len(message_ids_to_delete) + batch_filtered_messages = len(message_ids_to_delete) # Step 4: Batch delete messages and their relations if not self._dry_run: @@ -308,6 +464,7 @@ class MessagesCleanService: commit_ms = int((time.monotonic() - commit_start) * 1000) stats["total_deleted"] += messages_deleted + batch_deleted_messages = messages_deleted logger.info( "clean_messages (batch %s): processed %s messages, deleted %s messages", @@ -342,6 +499,13 @@ class MessagesCleanService: for msg_id in sampled_ids: logger.info("clean_messages (batch %s, dry_run) sample: message_id=%s", stats["batches"], msg_id) + self._metrics.record_batch( + scanned_messages=batch_scanned_messages, + filtered_messages=batch_filtered_messages, + deleted_messages=batch_deleted_messages, + batch_duration_seconds=time.monotonic() - batch_start, + ) + logger.info( "clean_messages completed: total batches: %s, total messages: %s, filtered messages: %s, total deleted: %s", stats["batches"], diff --git a/api/tests/unit_tests/services/test_messages_clean_service.py b/api/tests/unit_tests/services/test_messages_clean_service.py index 67ae2c9142..b8e7d2fb01 100644 --- a/api/tests/unit_tests/services/test_messages_clean_service.py +++ b/api/tests/unit_tests/services/test_messages_clean_service.py @@ -625,3 +625,53 @@ class TestMessagesCleanServiceFromDays: assert service._end_before == expected_end_before assert service._batch_size == 1000 # default assert service._dry_run is False # default + + +class TestMessagesCleanServiceRun: + """Unit tests for MessagesCleanService.run instrumentation behavior.""" + + def test_run_records_completion_metrics_on_success(self): + # Arrange + service = MessagesCleanService( + policy=BillingDisabledPolicy(), + start_from=datetime.datetime(2024, 1, 1), + end_before=datetime.datetime(2024, 1, 2), + batch_size=100, + dry_run=False, + ) + expected_stats = { + "batches": 1, + "total_messages": 10, + "filtered_messages": 5, + "total_deleted": 5, + } + service._clean_messages_by_time_range = MagicMock(return_value=expected_stats) # type: ignore[method-assign] + completion_calls: list[dict[str, object]] = [] + service._metrics.record_completion = lambda **kwargs: completion_calls.append(kwargs) # type: ignore[method-assign] + + # Act + result = service.run() + + # Assert + assert result == expected_stats + assert len(completion_calls) == 1 + assert completion_calls[0]["status"] == "success" + + def test_run_records_completion_metrics_on_failure(self): + # Arrange + service = MessagesCleanService( + policy=BillingDisabledPolicy(), + start_from=datetime.datetime(2024, 1, 1), + end_before=datetime.datetime(2024, 1, 2), + batch_size=100, + dry_run=False, + ) + service._clean_messages_by_time_range = MagicMock(side_effect=RuntimeError("clean failed")) # type: ignore[method-assign] + completion_calls: list[dict[str, object]] = [] + service._metrics.record_completion = lambda **kwargs: completion_calls.append(kwargs) # type: ignore[method-assign] + + # Act & Assert + with pytest.raises(RuntimeError, match="clean failed"): + service.run() + assert len(completion_calls) == 1 + assert completion_calls[0]["status"] == "failed"