dify/api/libs/db_migration_lock.py

"""
DB migration Redis lock with heartbeat renewal.

This is intentionally migration-specific. Background renewal is a trade-off that makes sense
for unbounded, blocking operations like DB migrations (DDL/DML) where the main thread cannot
periodically refresh the lock TTL.

Do NOT use this as a general-purpose lock primitive for normal application code. Prefer explicit
lock lifecycle management (e.g. redis-py Lock context manager + `extend()` / `reacquire()` from
the same thread) when execution flow is under control.
"""

from __future__ import annotations

import logging
import threading
from typing import Any

from redis.exceptions import LockNotOwnedError, RedisError

logger = logging.getLogger(__name__)

MIN_RENEW_INTERVAL_SECONDS = 0.1
DEFAULT_RENEW_INTERVAL_DIVISOR = 3
MIN_JOIN_TIMEOUT_SECONDS = 0.5
MAX_JOIN_TIMEOUT_SECONDS = 5.0
JOIN_TIMEOUT_MULTIPLIER = 2.0


class DbMigrationAutoRenewLock:
    """
    Redis lock wrapper that automatically renews TTL while held (migration-only).

    Notes:
    - We force `thread_local=False` when creating the underlying redis-py lock, because the
      lock token must be accessible from the heartbeat thread for `reacquire()` to work.
    - `release_safely()` is best-effort: it never raises, so it won't mask the caller's
      primary error/exit code.
    """

    _redis_client: Any
    _name: str
    _ttl_seconds: float
    _renew_interval_seconds: float
    _log_context: str | None
    _logger: logging.Logger

    _lock: Any
    _stop_event: threading.Event | None
    _thread: threading.Thread | None
    _acquired: bool

    def __init__(
        self,
        redis_client: Any,
        name: str,
        ttl_seconds: float = 60,
        renew_interval_seconds: float | None = None,
        *,
        logger: logging.Logger | None = None,
        log_context: str | None = None,
    ) -> None:
        self._redis_client = redis_client
        self._name = name
        self._ttl_seconds = float(ttl_seconds)
        self._renew_interval_seconds = (
            float(renew_interval_seconds)
            if renew_interval_seconds is not None
            else max(MIN_RENEW_INTERVAL_SECONDS, self._ttl_seconds / DEFAULT_RENEW_INTERVAL_DIVISOR)
        )
        self._logger = logger or logging.getLogger(__name__)
        self._log_context = log_context

        self._lock = None
        self._stop_event = None
        self._thread = None
        self._acquired = False

    @property
    def name(self) -> str:
        return self._name

    def acquire(self, *args: Any, **kwargs: Any) -> bool:
        """
        Acquire the lock and start heartbeat renewal on success.

        Accepts the same args/kwargs as redis-py `Lock.acquire()`.
        """
        # Prevent accidental double-acquire which could leave the previous heartbeat thread running.
        if self._acquired:
            raise RuntimeError("DB migration lock is already acquired; call release_safely() before acquiring again.")

        # Reuse the lock object if we already created one.
        if self._lock is None:
            self._lock = self._redis_client.lock(
                name=self._name,
                timeout=self._ttl_seconds,
                thread_local=False,
            )
        acquired = bool(self._lock.acquire(*args, **kwargs))
        self._acquired = acquired
        if acquired:
            self._start_heartbeat()
        return acquired

    def owned(self) -> bool:
        if self._lock is None:
            return False
        try:
            return bool(self._lock.owned())
        except Exception:
            # Ownership checks are best-effort and must not break callers.
            return False

    def _start_heartbeat(self) -> None:
        if self._lock is None:
            return
        if self._stop_event is not None:
            return

        self._stop_event = threading.Event()
        self._thread = threading.Thread(
            target=self._heartbeat_loop,
            args=(self._lock, self._stop_event),
            daemon=True,
            name=f"DbMigrationAutoRenewLock({self._name})",
        )
        self._thread.start()

    def _heartbeat_loop(self, lock: Any, stop_event: threading.Event) -> None:
        while not stop_event.wait(self._renew_interval_seconds):
            try:
                lock.reacquire()
            except LockNotOwnedError:
                self._logger.warning(
                    "DB migration lock is no longer owned during heartbeat; stop renewing. log_context=%s",
                    self._log_context,
                    exc_info=True,
                )
                return
            except RedisError:
                self._logger.warning(
                    "Failed to renew DB migration lock due to Redis error; will retry. log_context=%s",
                    self._log_context,
                    exc_info=True,
                )
            except Exception:
                self._logger.warning(
                    "Unexpected error while renewing DB migration lock; will retry. log_context=%s",
                    self._log_context,
                    exc_info=True,
                )

    def release_safely(self, *, status: str | None = None) -> None:
        """
        Stop heartbeat and release lock. Never raises.

        Args:
            status: Optional caller-provided status (e.g. 'successful'/'failed') to add context to logs.
        """
        lock = self._lock
        if lock is None:
            return

        self._stop_heartbeat()

        # Lock release errors should never mask the real error/exit code.
        try:
            lock.release()
        except LockNotOwnedError:
            self._logger.warning(
                "DB migration lock not owned on release; ignoring. status=%s log_context=%s",
                status,
                self._log_context,
                exc_info=True,
            )
        except RedisError:
            self._logger.warning(
                "Failed to release DB migration lock due to Redis error; ignoring. status=%s log_context=%s",
                status,
                self._log_context,
                exc_info=True,
            )
        except Exception:
            self._logger.warning(
                "Unexpected error while releasing DB migration lock; ignoring. status=%s log_context=%s",
                status,
                self._log_context,
                exc_info=True,
            )
        finally:
            self._acquired = False
            self._lock = None

    def _stop_heartbeat(self) -> None:
        if self._stop_event is None:
            return
        self._stop_event.set()
        if self._thread is not None:
            # Best-effort join: if Redis calls are blocked, the daemon thread may remain alive.
            join_timeout_seconds = max(
                MIN_JOIN_TIMEOUT_SECONDS,
                min(MAX_JOIN_TIMEOUT_SECONDS, self._renew_interval_seconds * JOIN_TIMEOUT_MULTIPLIER),
            )
            self._thread.join(timeout=join_timeout_seconds)
            if self._thread.is_alive():
                self._logger.warning(
                    "DB migration lock heartbeat thread did not stop within %.2fs; ignoring. log_context=%s",
                    join_timeout_seconds,
                    self._log_context,
                )
        self._stop_event = None
        self._thread = None