add clean message metrics

2026-05-09 21:28:25 +08:00 · 2026-03-09 13:45:08 +08:00 · 2026-03-09 13:45:08 +08:00 · b0e8becd14
commit b0e8becd14
parent f90e0d781a
3 changed files with 222 additions and 4 deletions
--- a/api/commands.py
+++ b/api/commands.py
@ -2630,6 +2630,8 @@ def clean_expired_messages(
    """
    Clean expired messages and related data for tenants based on clean policy.
    """
+    from extensions.otel.runtime import flush_telemetry
+
    click.echo(click.style("clean_messages: start clean messages.", fg="green"))

    start_at = time.perf_counter()
@ -2671,6 +2673,8 @@ def clean_expired_messages(
            )
        )
        raise
+    finally:
+        flush_telemetry()

    click.echo(click.style("messages cleanup completed.", fg="green"))

--- a/api/services/retention/conversation/messages_clean_service.py
+++ b/api/services/retention/conversation/messages_clean_service.py
@ -1,16 +1,16 @@
 import datetime
 import logging
-import os
 import random
 import time
 from collections.abc import Sequence
-from typing import cast
+from typing import TYPE_CHECKING, cast

 import sqlalchemy as sa
 from sqlalchemy import delete, select, tuple_
 from sqlalchemy.engine import CursorResult
 from sqlalchemy.orm import Session

+from configs import dify_config
 from extensions.ext_database import db
 from models.model import (
    App,
@ -32,6 +32,127 @@ from services.retention.conversation.messages_clean_policy import (
 logger = logging.getLogger(__name__)


+if TYPE_CHECKING:
+    from opentelemetry.metrics import Counter, Histogram
+
+
+class MessagesCleanupMetrics:
+    """
+    Records low-cardinality OpenTelemetry metrics for expired message cleanup jobs.
+
+    We keep labels stable (dry_run/window_mode/status) so these metrics remain
+    dashboard-friendly for long-running CronJob executions.
+    """
+
+    _job_runs_total: "Counter | None"
+    _batches_total: "Counter | None"
+    _messages_scanned_total: "Counter | None"
+    _messages_filtered_total: "Counter | None"
+    _messages_deleted_total: "Counter | None"
+    _job_duration_seconds: "Histogram | None"
+    _batch_duration_seconds: "Histogram | None"
+    _base_attributes: dict[str, str]
+
+    def __init__(self, *, dry_run: bool, has_window: bool) -> None:
+        self._job_runs_total = None
+        self._batches_total = None
+        self._messages_scanned_total = None
+        self._messages_filtered_total = None
+        self._messages_deleted_total = None
+        self._job_duration_seconds = None
+        self._batch_duration_seconds = None
+        self._base_attributes = {
+            "job_name": "messages_cleanup",
+            "dry_run": str(dry_run).lower(),
+            "window_mode": "between" if has_window else "before_cutoff",
+        }
+        self._init_instruments()
+
+    def _init_instruments(self) -> None:
+        try:
+            from opentelemetry.metrics import get_meter
+
+            meter = get_meter("messages_cleanup", version=dify_config.project.version)
+            self._job_runs_total = meter.create_counter(
+                "messages_cleanup_jobs_total",
+                description="Total number of expired message cleanup jobs by status.",
+                unit="{job}",
+            )
+            self._batches_total = meter.create_counter(
+                "messages_cleanup_batches_total",
+                description="Total number of message cleanup batches processed.",
+                unit="{batch}",
+            )
+            self._messages_scanned_total = meter.create_counter(
+                "messages_cleanup_scanned_messages_total",
+                description="Total messages scanned by cleanup jobs.",
+                unit="{message}",
+            )
+            self._messages_filtered_total = meter.create_counter(
+                "messages_cleanup_filtered_messages_total",
+                description="Total messages selected by cleanup policy.",
+                unit="{message}",
+            )
+            self._messages_deleted_total = meter.create_counter(
+                "messages_cleanup_deleted_messages_total",
+                description="Total messages deleted by cleanup jobs.",
+                unit="{message}",
+            )
+            self._job_duration_seconds = meter.create_histogram(
+                "messages_cleanup_job_duration_seconds",
+                description="Duration of expired message cleanup jobs in seconds.",
+                unit="s",
+            )
+            self._batch_duration_seconds = meter.create_histogram(
+                "messages_cleanup_batch_duration_seconds",
+                description="Duration of expired message cleanup batch processing in seconds.",
+                unit="s",
+            )
+        except Exception:
+            logger.exception("messages_cleanup_metrics: failed to initialize instruments")
+
+    def _attrs(self, **extra: str) -> dict[str, str]:
+        return {**self._base_attributes, **extra}
+
+    @staticmethod
+    def _add(counter: "Counter | None", value: int, attributes: dict[str, str]) -> None:
+        if not counter or value <= 0:
+            return
+        try:
+            counter.add(value, attributes)
+        except Exception:
+            logger.exception("messages_cleanup_metrics: failed to add counter value")
+
+    @staticmethod
+    def _record(histogram: "Histogram | None", value: float, attributes: dict[str, str]) -> None:
+        if not histogram:
+            return
+        try:
+            histogram.record(value, attributes)
+        except Exception:
+            logger.exception("messages_cleanup_metrics: failed to record histogram value")
+
+    def record_batch(
+        self,
+        *,
+        scanned_messages: int,
+        filtered_messages: int,
+        deleted_messages: int,
+        batch_duration_seconds: float,
+    ) -> None:
+        attributes = self._attrs()
+        self._add(self._batches_total, 1, attributes)
+        self._add(self._messages_scanned_total, scanned_messages, attributes)
+        self._add(self._messages_filtered_total, filtered_messages, attributes)
+        self._add(self._messages_deleted_total, deleted_messages, attributes)
+        self._record(self._batch_duration_seconds, batch_duration_seconds, attributes)
+
+    def record_completion(self, *, status: str, job_duration_seconds: float) -> None:
+        attributes = self._attrs(status=status)
+        self._add(self._job_runs_total, 1, attributes)
+        self._record(self._job_duration_seconds, job_duration_seconds, attributes)
+
+
 class MessagesCleanService:
    """
    Service for cleaning expired messages based on retention policies.
@ -63,6 +184,7 @@ class MessagesCleanService:
        self._start_from = start_from
        self._batch_size = batch_size
        self._dry_run = dry_run
+        self._metrics = MessagesCleanupMetrics(dry_run=dry_run, has_window=bool(start_from))

    @classmethod
    def from_time_range(
@ -161,7 +283,18 @@ class MessagesCleanService:
        Returns:
            Dict with statistics: batches, filtered_messages, total_deleted
        """
-        return self._clean_messages_by_time_range()
+        status = "success"
+        run_start = time.monotonic()
+        try:
+            return self._clean_messages_by_time_range()
+        except Exception:
+            status = "failed"
+            raise
+        finally:
+            self._metrics.record_completion(
+                status=status,
+                job_duration_seconds=time.monotonic() - run_start,
+            )

    def _clean_messages_by_time_range(self) -> dict[str, int]:
        """
@ -196,11 +329,14 @@ class MessagesCleanService:
            self._end_before,
        )

-        max_batch_interval_ms = int(os.environ.get("SANDBOX_EXPIRED_RECORDS_CLEAN_BATCH_MAX_INTERVAL", 200))
+        max_batch_interval_ms = dify_config.SANDBOX_EXPIRED_RECORDS_CLEAN_BATCH_MAX_INTERVAL

        while True:
            stats["batches"] += 1
            batch_start = time.monotonic()
+            batch_scanned_messages = 0
+            batch_filtered_messages = 0
+            batch_deleted_messages = 0

            # Step 1: Fetch a batch of messages using cursor
            with Session(db.engine, expire_on_commit=False) as session:
@ -239,9 +375,16 @@ class MessagesCleanService:

                # Track total messages fetched across all batches
                stats["total_messages"] += len(messages)
+                batch_scanned_messages = len(messages)

                if not messages:
                    logger.info("clean_messages (batch %s): no more messages to process", stats["batches"])
+                    self._metrics.record_batch(
+                        scanned_messages=batch_scanned_messages,
+                        filtered_messages=batch_filtered_messages,
+                        deleted_messages=batch_deleted_messages,
+                        batch_duration_seconds=time.monotonic() - batch_start,
+                    )
                    break

                # Update cursor to the last message's (created_at, id)
@ -267,6 +410,12 @@ class MessagesCleanService:

            if not apps:
                logger.info("clean_messages (batch %s): no apps found, skip", stats["batches"])
+                self._metrics.record_batch(
+                    scanned_messages=batch_scanned_messages,
+                    filtered_messages=batch_filtered_messages,
+                    deleted_messages=batch_deleted_messages,
+                    batch_duration_seconds=time.monotonic() - batch_start,
+                )
                continue

            # Build app_id -> tenant_id mapping
@ -285,9 +434,16 @@ class MessagesCleanService:

            if not message_ids_to_delete:
                logger.info("clean_messages (batch %s): no messages to delete, skip", stats["batches"])
+                self._metrics.record_batch(
+                    scanned_messages=batch_scanned_messages,
+                    filtered_messages=batch_filtered_messages,
+                    deleted_messages=batch_deleted_messages,
+                    batch_duration_seconds=time.monotonic() - batch_start,
+                )
                continue

            stats["filtered_messages"] += len(message_ids_to_delete)
+            batch_filtered_messages = len(message_ids_to_delete)

            # Step 4: Batch delete messages and their relations
            if not self._dry_run:
@ -308,6 +464,7 @@ class MessagesCleanService:
                    commit_ms = int((time.monotonic() - commit_start) * 1000)

                    stats["total_deleted"] += messages_deleted
+                    batch_deleted_messages = messages_deleted

                    logger.info(
                        "clean_messages (batch %s): processed %s messages, deleted %s messages",
@ -342,6 +499,13 @@ class MessagesCleanService:
                for msg_id in sampled_ids:
                    logger.info("clean_messages (batch %s, dry_run) sample: message_id=%s", stats["batches"], msg_id)

+            self._metrics.record_batch(
+                scanned_messages=batch_scanned_messages,
+                filtered_messages=batch_filtered_messages,
+                deleted_messages=batch_deleted_messages,
+                batch_duration_seconds=time.monotonic() - batch_start,
+            )
+
        logger.info(
            "clean_messages completed: total batches: %s, total messages: %s, filtered messages: %s, total deleted: %s",
            stats["batches"],
--- a/api/tests/unit_tests/services/test_messages_clean_service.py
+++ b/api/tests/unit_tests/services/test_messages_clean_service.py
@ -625,3 +625,53 @@ class TestMessagesCleanServiceFromDays:
        assert service._end_before == expected_end_before
        assert service._batch_size == 1000  # default
        assert service._dry_run is False  # default
+
+
+class TestMessagesCleanServiceRun:
+    """Unit tests for MessagesCleanService.run instrumentation behavior."""
+
+    def test_run_records_completion_metrics_on_success(self):
+        # Arrange
+        service = MessagesCleanService(
+            policy=BillingDisabledPolicy(),
+            start_from=datetime.datetime(2024, 1, 1),
+            end_before=datetime.datetime(2024, 1, 2),
+            batch_size=100,
+            dry_run=False,
+        )
+        expected_stats = {
+            "batches": 1,
+            "total_messages": 10,
+            "filtered_messages": 5,
+            "total_deleted": 5,
+        }
+        service._clean_messages_by_time_range = MagicMock(return_value=expected_stats)  # type: ignore[method-assign]
+        completion_calls: list[dict[str, object]] = []
+        service._metrics.record_completion = lambda **kwargs: completion_calls.append(kwargs)  # type: ignore[method-assign]
+
+        # Act
+        result = service.run()
+
+        # Assert
+        assert result == expected_stats
+        assert len(completion_calls) == 1
+        assert completion_calls[0]["status"] == "success"
+
+    def test_run_records_completion_metrics_on_failure(self):
+        # Arrange
+        service = MessagesCleanService(
+            policy=BillingDisabledPolicy(),
+            start_from=datetime.datetime(2024, 1, 1),
+            end_before=datetime.datetime(2024, 1, 2),
+            batch_size=100,
+            dry_run=False,
+        )
+        service._clean_messages_by_time_range = MagicMock(side_effect=RuntimeError("clean failed"))  # type: ignore[method-assign]
+        completion_calls: list[dict[str, object]] = []
+        service._metrics.record_completion = lambda **kwargs: completion_calls.append(kwargs)  # type: ignore[method-assign]
+
+        # Act & Assert
+        with pytest.raises(RuntimeError, match="clean failed"):
+            service.run()
+        assert len(completion_calls) == 1
+        assert completion_calls[0]["status"] == "failed"