feat(api): enhance database migration locking mechanism and configuration

- Introduced a configurable Redis lock TTL for database migrations in DeploymentConfig.
- Updated the upgrade_db command to handle lock release errors gracefully.
- Added documentation for the new MIGRATION_LOCK_TTL environment variable in the .env.example file and docker-compose.yaml.

(cherry picked from commit 4a05fb120622908bc109a3715686706aab3d3b59)
This commit is contained in:
L1nSn0w 2026-02-13 12:15:55 +08:00
parent 1977e68b2d
commit 9acdfbde2f
5 changed files with 104 additions and 3 deletions

View File

@ -10,6 +10,7 @@ import click
import sqlalchemy as sa
from flask import current_app
from pydantic import TypeAdapter
from redis.exceptions import LockNotOwnedError, RedisError
from sqlalchemy import select
from sqlalchemy.exc import SQLAlchemyError
from sqlalchemy.orm import sessionmaker
@ -727,7 +728,7 @@ def create_tenant(email: str, language: str | None = None, name: str | None = No
@click.command("upgrade-db", help="Upgrade the database")
def upgrade_db():
click.echo("Preparing database migration...")
lock = redis_client.lock(name="db_upgrade_lock", timeout=60)
lock = redis_client.lock(name="db_upgrade_lock", timeout=dify_config.MIGRATION_LOCK_TTL)
if lock.acquire(blocking=False):
try:
click.echo(click.style("Starting database migration.", fg="green"))
@ -744,7 +745,13 @@ def upgrade_db():
click.echo(click.style(f"Database migration failed: {e}", fg="red"))
raise SystemExit(1)
finally:
lock.release()
# Lock release errors should never mask the real migration failure.
try:
lock.release()
except LockNotOwnedError:
logger.warning("DB migration lock not owned on release (likely expired); ignoring.")
except RedisError:
logger.warning("Failed to release DB migration lock due to Redis error; ignoring.", exc_info=True)
else:
click.echo("Database migration skipped")

View File

@ -1,4 +1,4 @@
from pydantic import Field
from pydantic import Field, PositiveInt
from pydantic_settings import BaseSettings
@ -32,3 +32,8 @@ class DeploymentConfig(BaseSettings):
description="Deployment environment (e.g., 'PRODUCTION', 'DEVELOPMENT'), default to PRODUCTION",
default="PRODUCTION",
)
MIGRATION_LOCK_TTL: PositiveInt = Field(
description="Redis lock TTL for startup DB migration (seconds). Increase for large/slow databases.",
default=3600,
)

View File

@ -0,0 +1,84 @@
import sys
import types
from unittest.mock import MagicMock
import commands
from configs import dify_config
def _install_fake_flask_migrate(monkeypatch, upgrade_impl) -> None:
module = types.ModuleType("flask_migrate")
module.upgrade = upgrade_impl
monkeypatch.setitem(sys.modules, "flask_migrate", module)
def _invoke_upgrade_db() -> int:
try:
commands.upgrade_db.callback()
except SystemExit as e:
return int(e.code or 0)
return 0
def test_upgrade_db_skips_when_lock_not_acquired(monkeypatch, capsys):
monkeypatch.setattr(dify_config, "MIGRATION_LOCK_TTL", 1234)
lock = MagicMock()
lock.acquire.return_value = False
commands.redis_client.lock.return_value = lock
exit_code = _invoke_upgrade_db()
captured = capsys.readouterr()
assert exit_code == 0
assert "Database migration skipped" in captured.out
commands.redis_client.lock.assert_called_once_with(name="db_upgrade_lock", timeout=1234)
lock.acquire.assert_called_once_with(blocking=False)
lock.release.assert_not_called()
def test_upgrade_db_failure_not_masked_by_lock_release(monkeypatch, capsys):
monkeypatch.setattr(dify_config, "MIGRATION_LOCK_TTL", 321)
lock = MagicMock()
lock.acquire.return_value = True
lock.release.side_effect = commands.LockNotOwnedError("simulated")
commands.redis_client.lock.return_value = lock
def _upgrade():
raise RuntimeError("boom")
_install_fake_flask_migrate(monkeypatch, _upgrade)
exit_code = _invoke_upgrade_db()
captured = capsys.readouterr()
assert exit_code == 1
assert "Database migration failed: boom" in captured.out
commands.redis_client.lock.assert_called_once_with(name="db_upgrade_lock", timeout=321)
lock.acquire.assert_called_once_with(blocking=False)
lock.release.assert_called_once()
def test_upgrade_db_success_ignores_lock_not_owned_on_release(monkeypatch, capsys):
monkeypatch.setattr(dify_config, "MIGRATION_LOCK_TTL", 999)
lock = MagicMock()
lock.acquire.return_value = True
lock.release.side_effect = commands.LockNotOwnedError("simulated")
commands.redis_client.lock.return_value = lock
_install_fake_flask_migrate(monkeypatch, lambda: None)
exit_code = _invoke_upgrade_db()
captured = capsys.readouterr()
assert exit_code == 0
assert "Database migration successful!" in captured.out
commands.redis_client.lock.assert_called_once_with(name="db_upgrade_lock", timeout=999)
lock.acquire.assert_called_once_with(blocking=False)
lock.release.assert_called_once()

View File

@ -125,6 +125,10 @@ OPENAI_API_BASE=https://api.openai.com/v1
# and the application will start after the migrations have completed.
MIGRATION_ENABLED=true
# Redis lock TTL (in seconds) for startup database migrations.
# Increase this value for long-running migrations to avoid concurrent upgrades in multi-replica deployments.
MIGRATION_LOCK_TTL=3600
# File Access Time specifies a time interval in seconds for the file to be accessed.
# The default value is 300 seconds.
FILES_ACCESS_TIMEOUT=300

View File

@ -32,6 +32,7 @@ x-shared-env: &shared-api-worker-env
CHECK_UPDATE_URL: ${CHECK_UPDATE_URL:-https://updates.dify.ai}
OPENAI_API_BASE: ${OPENAI_API_BASE:-https://api.openai.com/v1}
MIGRATION_ENABLED: ${MIGRATION_ENABLED:-true}
MIGRATION_LOCK_TTL: ${MIGRATION_LOCK_TTL:-3600}
FILES_ACCESS_TIMEOUT: ${FILES_ACCESS_TIMEOUT:-300}
ACCESS_TOKEN_EXPIRE_MINUTES: ${ACCESS_TOKEN_EXPIRE_MINUTES:-60}
REFRESH_TOKEN_EXPIRE_DAYS: ${REFRESH_TOKEN_EXPIRE_DAYS:-30}