From 9acdfbde2f15b19e90d48a9fed115361aebb5b24 Mon Sep 17 00:00:00 2001 From: L1nSn0w Date: Fri, 13 Feb 2026 12:15:55 +0800 Subject: [PATCH] feat(api): enhance database migration locking mechanism and configuration - Introduced a configurable Redis lock TTL for database migrations in DeploymentConfig. - Updated the upgrade_db command to handle lock release errors gracefully. - Added documentation for the new MIGRATION_LOCK_TTL environment variable in the .env.example file and docker-compose.yaml. (cherry picked from commit 4a05fb120622908bc109a3715686706aab3d3b59) --- api/commands.py | 11 ++- api/configs/deploy/__init__.py | 7 +- .../unit_tests/commands/test_upgrade_db.py | 84 +++++++++++++++++++ docker/.env.example | 4 + docker/docker-compose.yaml | 1 + 5 files changed, 104 insertions(+), 3 deletions(-) create mode 100644 api/tests/unit_tests/commands/test_upgrade_db.py diff --git a/api/commands.py b/api/commands.py index 93855bc3b8..fbf16de8be 100644 --- a/api/commands.py +++ b/api/commands.py @@ -10,6 +10,7 @@ import click import sqlalchemy as sa from flask import current_app from pydantic import TypeAdapter +from redis.exceptions import LockNotOwnedError, RedisError from sqlalchemy import select from sqlalchemy.exc import SQLAlchemyError from sqlalchemy.orm import sessionmaker @@ -727,7 +728,7 @@ def create_tenant(email: str, language: str | None = None, name: str | None = No @click.command("upgrade-db", help="Upgrade the database") def upgrade_db(): click.echo("Preparing database migration...") - lock = redis_client.lock(name="db_upgrade_lock", timeout=60) + lock = redis_client.lock(name="db_upgrade_lock", timeout=dify_config.MIGRATION_LOCK_TTL) if lock.acquire(blocking=False): try: click.echo(click.style("Starting database migration.", fg="green")) @@ -744,7 +745,13 @@ def upgrade_db(): click.echo(click.style(f"Database migration failed: {e}", fg="red")) raise SystemExit(1) finally: - lock.release() + # Lock release errors should never mask the real migration failure. + try: + lock.release() + except LockNotOwnedError: + logger.warning("DB migration lock not owned on release (likely expired); ignoring.") + except RedisError: + logger.warning("Failed to release DB migration lock due to Redis error; ignoring.", exc_info=True) else: click.echo("Database migration skipped") diff --git a/api/configs/deploy/__init__.py b/api/configs/deploy/__init__.py index 63f4dfba63..4ac57f0370 100644 --- a/api/configs/deploy/__init__.py +++ b/api/configs/deploy/__init__.py @@ -1,4 +1,4 @@ -from pydantic import Field +from pydantic import Field, PositiveInt from pydantic_settings import BaseSettings @@ -32,3 +32,8 @@ class DeploymentConfig(BaseSettings): description="Deployment environment (e.g., 'PRODUCTION', 'DEVELOPMENT'), default to PRODUCTION", default="PRODUCTION", ) + + MIGRATION_LOCK_TTL: PositiveInt = Field( + description="Redis lock TTL for startup DB migration (seconds). Increase for large/slow databases.", + default=3600, + ) diff --git a/api/tests/unit_tests/commands/test_upgrade_db.py b/api/tests/unit_tests/commands/test_upgrade_db.py new file mode 100644 index 0000000000..c262ef71cc --- /dev/null +++ b/api/tests/unit_tests/commands/test_upgrade_db.py @@ -0,0 +1,84 @@ +import sys +import types +from unittest.mock import MagicMock + +import commands +from configs import dify_config + + +def _install_fake_flask_migrate(monkeypatch, upgrade_impl) -> None: + module = types.ModuleType("flask_migrate") + module.upgrade = upgrade_impl + monkeypatch.setitem(sys.modules, "flask_migrate", module) + + +def _invoke_upgrade_db() -> int: + try: + commands.upgrade_db.callback() + except SystemExit as e: + return int(e.code or 0) + return 0 + + +def test_upgrade_db_skips_when_lock_not_acquired(monkeypatch, capsys): + monkeypatch.setattr(dify_config, "MIGRATION_LOCK_TTL", 1234) + + lock = MagicMock() + lock.acquire.return_value = False + commands.redis_client.lock.return_value = lock + + exit_code = _invoke_upgrade_db() + captured = capsys.readouterr() + + assert exit_code == 0 + assert "Database migration skipped" in captured.out + + commands.redis_client.lock.assert_called_once_with(name="db_upgrade_lock", timeout=1234) + lock.acquire.assert_called_once_with(blocking=False) + lock.release.assert_not_called() + + +def test_upgrade_db_failure_not_masked_by_lock_release(monkeypatch, capsys): + monkeypatch.setattr(dify_config, "MIGRATION_LOCK_TTL", 321) + + lock = MagicMock() + lock.acquire.return_value = True + lock.release.side_effect = commands.LockNotOwnedError("simulated") + commands.redis_client.lock.return_value = lock + + def _upgrade(): + raise RuntimeError("boom") + + _install_fake_flask_migrate(monkeypatch, _upgrade) + + exit_code = _invoke_upgrade_db() + captured = capsys.readouterr() + + assert exit_code == 1 + assert "Database migration failed: boom" in captured.out + + commands.redis_client.lock.assert_called_once_with(name="db_upgrade_lock", timeout=321) + lock.acquire.assert_called_once_with(blocking=False) + lock.release.assert_called_once() + + +def test_upgrade_db_success_ignores_lock_not_owned_on_release(monkeypatch, capsys): + monkeypatch.setattr(dify_config, "MIGRATION_LOCK_TTL", 999) + + lock = MagicMock() + lock.acquire.return_value = True + lock.release.side_effect = commands.LockNotOwnedError("simulated") + commands.redis_client.lock.return_value = lock + + _install_fake_flask_migrate(monkeypatch, lambda: None) + + exit_code = _invoke_upgrade_db() + captured = capsys.readouterr() + + assert exit_code == 0 + assert "Database migration successful!" in captured.out + + commands.redis_client.lock.assert_called_once_with(name="db_upgrade_lock", timeout=999) + lock.acquire.assert_called_once_with(blocking=False) + lock.release.assert_called_once() + diff --git a/docker/.env.example b/docker/.env.example index 41a0205bf5..9339404b58 100644 --- a/docker/.env.example +++ b/docker/.env.example @@ -125,6 +125,10 @@ OPENAI_API_BASE=https://api.openai.com/v1 # and the application will start after the migrations have completed. MIGRATION_ENABLED=true +# Redis lock TTL (in seconds) for startup database migrations. +# Increase this value for long-running migrations to avoid concurrent upgrades in multi-replica deployments. +MIGRATION_LOCK_TTL=3600 + # File Access Time specifies a time interval in seconds for the file to be accessed. # The default value is 300 seconds. FILES_ACCESS_TIMEOUT=300 diff --git a/docker/docker-compose.yaml b/docker/docker-compose.yaml index 1886f848e0..57a0c089c8 100644 --- a/docker/docker-compose.yaml +++ b/docker/docker-compose.yaml @@ -32,6 +32,7 @@ x-shared-env: &shared-api-worker-env CHECK_UPDATE_URL: ${CHECK_UPDATE_URL:-https://updates.dify.ai} OPENAI_API_BASE: ${OPENAI_API_BASE:-https://api.openai.com/v1} MIGRATION_ENABLED: ${MIGRATION_ENABLED:-true} + MIGRATION_LOCK_TTL: ${MIGRATION_LOCK_TTL:-3600} FILES_ACCESS_TIMEOUT: ${FILES_ACCESS_TIMEOUT:-300} ACCESS_TOKEN_EXPIRE_MINUTES: ${ACCESS_TOKEN_EXPIRE_MINUTES:-60} REFRESH_TOKEN_EXPIRE_DAYS: ${REFRESH_TOKEN_EXPIRE_DAYS:-30}