mirror of https://github.com/langgenius/dify.git
feat(api): Implement EventManager error logging and add coverage (#29204)
- Ensure `EventManager._notify_layers` logs exceptions instead of silently swallowing them, so GraphEngine layer failures surface for debugging
- Introduce unit tests to assert the logger captures the runtime error when collecting events
- Enable the `S110` lint rule to catch `try-except-pass` patterns
- Add proper error logging for existing `try-except-pass` blocks
This commit is contained in:
parent a25faa334a
commit 91667e3c1d
@@ -36,17 +36,20 @@ select = [
     "UP", # pyupgrade rules
     "W191", # tab-indentation
     "W605", # invalid-escape-sequence
+    "G001", # don't use str format to logging messages
+    "G003", # don't use + in logging messages
+    "G004", # don't use f-strings to format logging messages
+    "UP042", # use StrEnum,
+    "S110", # disallow the try-except-pass pattern.
+
     # security related linting rules
     # RCE proctection (sort of)
     "S102", # exec-builtin, disallow use of `exec`
     "S307", # suspicious-eval-usage, disallow use of `eval` and `ast.literal_eval`
     "S301", # suspicious-pickle-usage, disallow use of `pickle` and its wrappers.
     "S302", # suspicious-marshal-usage, disallow use of `marshal` module
-    "S311", # suspicious-non-cryptographic-random-usage
-    "G001", # don't use str format to logging messages
-    "G003", # don't use + in logging messages
-    "G004", # don't use f-strings to format logging messages
-    "UP042", # use StrEnum
+    "S311", # suspicious-non-cryptographic-random-usage,
+
 ]

 ignore = [
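For reference, `S110` (flake8-bandit's `try-except-pass` rule) flags handlers that catch an exception and discard it with a bare `pass`. The fix applied throughout this commit keeps the failure tolerated but records it. A minimal before/after sketch (the function name is illustrative, not from the diff):

    import logging

    logger = logging.getLogger(__name__)


    def risky_operation() -> None:
        raise RuntimeError("boom")


    # Before: flagged by S110 -- the exception vanishes without a trace.
    try:
        risky_operation()
    except Exception:
        pass

    # After: still tolerated, but logged at ERROR level with the traceback attached.
    try:
        risky_operation()
    except Exception:
        logger.exception("risky_operation failed")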
@@ -91,18 +94,16 @@ ignore = [
 "configs/*" = [
     "N802", # invalid-function-name
 ]
-"core/model_runtime/callbacks/base_callback.py" = [
-    "T201",
-]
-"core/workflow/callbacks/workflow_logging_callback.py" = [
-    "T201",
-]
+"core/model_runtime/callbacks/base_callback.py" = ["T201"]
+"core/workflow/callbacks/workflow_logging_callback.py" = ["T201"]
 "libs/gmpy2_pkcs10aep_cipher.py" = [
     "N803", # invalid-argument-name
 ]
 "tests/*" = [
     "F811", # redefined-while-unused
-    "T201", # allow print in tests
+    "T201", # allow print in tests,
+    "S110", # allow ignoring exceptions in tests code (currently)
+
 ]

 [lint.pyflakes]
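The `G001`/`G003`/`G004` rules selected above enforce lazy `%`-style placeholders in logging calls, which is why every `logger.exception` added in this commit passes its arguments separately instead of interpolating them. A short sketch of what each rule rejects (variable names illustrative):

    import logging

    logger = logging.getLogger(__name__)
    key = "secret_variable"

    logger.error("failed to decrypt {}".format(key))  # G001: str.format in a logging call
    logger.error("failed to decrypt " + key)          # G003: string concatenation
    logger.error(f"failed to decrypt {key}")          # G004: f-string
    logger.error("failed to decrypt %s", key)         # OK: formatted lazily, only when emitted

Running `ruff check --select S110 .` against the package is one way to list any remaining silent handlers.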
@@ -1,3 +1,4 @@
+import logging
 import time
 from collections.abc import Callable
 from datetime import timedelta
@@ -28,6 +29,8 @@ P = ParamSpec("P")
 R = TypeVar("R")
 T = TypeVar("T")

+logger = logging.getLogger(__name__)
+

 class WhereisUserArg(StrEnum):
     """
@@ -238,8 +241,8 @@ def validate_dataset_token(view: Callable[Concatenate[T, P], R] | None = None):
                     # Basic check: UUIDs are 36 chars with hyphens
                     if len(str_id) == 36 and str_id.count("-") == 4:
                         dataset_id = str_id
-            except:
-                pass
+            except Exception:
+                logger.exception("Failed to parse dataset_id from class method args")
         elif len(args) > 0:
             # Not a class method, check if args[0] looks like a UUID
             potential_id = args[0]
@@ -247,8 +250,8 @@ def validate_dataset_token(view: Callable[Concatenate[T, P], R] | None = None):
                 str_id = str(potential_id)
                 if len(str_id) == 36 and str_id.count("-") == 4:
                     dataset_id = str_id
-            except:
-                pass
+            except Exception:
+                logger.exception("Failed to parse dataset_id from positional args")

         # Validate dataset if dataset_id is provided
         if dataset_id:
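The length-and-hyphen check in the context above is a deliberately cheap heuristic. For comparison only (not what this diff does), a strict check would round-trip the value through `uuid.UUID`:

    import uuid

    def looks_like_uuid(value: str) -> bool:
        # Stricter alternative to the 36-chars-and-4-hyphens heuristic above.
        try:
            uuid.UUID(value)
            return True
        except ValueError:
            return False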
@@ -1,4 +1,5 @@
 import json
+import logging
 from abc import ABC, abstractmethod
 from collections.abc import Generator, Mapping, Sequence
 from typing import Any
@@ -23,6 +24,8 @@ from core.tools.entities.tool_entities import ToolInvokeMeta
 from core.tools.tool_engine import ToolEngine
 from models.model import Message

+logger = logging.getLogger(__name__)
+

 class CotAgentRunner(BaseAgentRunner, ABC):
     _is_first_iteration = True
@@ -400,8 +403,8 @@ class CotAgentRunner(BaseAgentRunner, ABC):
                         action_input=json.loads(message.tool_calls[0].function.arguments),
                     )
                     current_scratchpad.action_str = json.dumps(current_scratchpad.action.to_dict())
-                except:
-                    pass
+                except Exception:
+                    logger.exception("Failed to parse tool call from assistant message")
             elif isinstance(message, ToolPromptMessage):
                 if current_scratchpad:
                     assert isinstance(message.content, str)
@@ -253,7 +253,7 @@ class ProviderConfiguration(BaseModel):
                 try:
                     credentials[key] = encrypter.decrypt_token(tenant_id=self.tenant_id, token=credentials[key])
                 except Exception:
-                    pass
+                    logger.exception("Failed to decrypt credential secret variable %s", key)

         return self.obfuscated_credentials(
             credentials=credentials,
@@ -765,7 +765,7 @@ class ProviderConfiguration(BaseModel):
                 try:
                     credentials[key] = encrypter.decrypt_token(tenant_id=self.tenant_id, token=credentials[key])
                 except Exception:
-                    pass
+                    logger.exception("Failed to decrypt model credential secret variable %s", key)

             current_credential_id = credential_record.id
             current_credential_name = credential_record.credential_name
@@ -1,3 +1,4 @@
+import logging
 from collections.abc import Sequence

 import httpx
@@ -8,6 +9,7 @@ from core.helper.download import download_with_size_limit
 from core.plugin.entities.marketplace import MarketplacePluginDeclaration

 marketplace_api_url = URL(str(dify_config.MARKETPLACE_API_URL))
+logger = logging.getLogger(__name__)


 def get_plugin_pkg_url(plugin_unique_identifier: str) -> str:
@@ -55,7 +57,9 @@ def batch_fetch_plugin_manifests_ignore_deserialization_error(
         try:
             result.append(MarketplacePluginDeclaration.model_validate(plugin))
         except Exception:
-            pass
+            logger.exception(
+                "Failed to deserialize marketplace plugin manifest for %s", plugin.get("plugin_id", "unknown")
+            )

     return result

@@ -521,4 +521,4 @@ class TencentDataTrace(BaseTraceInstance):
             if hasattr(self, "trace_client"):
                 self.trace_client.shutdown()
         except Exception:
-            pass
+            logger.exception("[Tencent APM] Failed to shutdown trace client during cleanup")
@@ -723,7 +723,7 @@ class ToolManager:
                 )
             except Exception:
                 # app has been deleted
-                pass
+                logger.exception("Failed to transform workflow provider %s to controller", workflow_provider.id)

         labels = ToolLabelManager.get_tools_labels(
             [cast(ToolProviderController, controller) for controller in workflow_provider_controllers]
@@ -2,6 +2,7 @@
 Unified event manager for collecting and emitting events.
 """

+import logging
 import threading
 import time
 from collections.abc import Generator
@@ -12,6 +13,8 @@ from core.workflow.graph_events import GraphEngineEvent

 from ..layers.base import GraphEngineLayer

+_logger = logging.getLogger(__name__)
+

 @final
 class ReadWriteLock:
@@ -180,5 +183,4 @@ class EventManager:
             try:
                 layer.on_event(event)
             except Exception:
-                # Silently ignore layer errors during collection
-                pass
+                _logger.exception("Error in layer on_event, layer_type=%s", type(layer))
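Put together, the collection loop in `EventManager` ends up reading roughly as below. This is a sketch reconstructed from the hunk above; the method signature and the `self._layers` attribute are assumptions, not taken from the diff:

    def _notify_layers(self, event: GraphEngineEvent) -> None:
        for layer in self._layers:
            try:
                layer.on_event(event)
            except Exception:
                # One failing layer no longer silently breaks delivery to the others.
                _logger.exception("Error in layer on_event, layer_type=%s", type(layer))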
@@ -6,12 +6,15 @@ using the new Redis command channel, without requiring user permission checks.
 Supports stop, pause, and resume operations.
 """

+import logging
 from typing import final

 from core.workflow.graph_engine.command_channels.redis_channel import RedisChannel
 from core.workflow.graph_engine.entities.commands import AbortCommand, GraphEngineCommand, PauseCommand
 from extensions.ext_redis import redis_client

+logger = logging.getLogger(__name__)
+

 @final
 class GraphEngineManager:
@@ -57,4 +60,4 @@ class GraphEngineManager:
         except Exception:
-            # Silently fail if Redis is unavailable
-            # The legacy control mechanisms will still work
-            pass
+            logger.exception(
+                "Failed to send graph engine command %s for task %s", command.__class__.__name__, task_id
+            )
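A detail the whole commit relies on: `logger.exception` logs at ERROR level and attaches the traceback of the exception currently being handled, so it is only meaningful inside an `except` block. A self-contained demonstration:

    import logging

    logging.basicConfig(level=logging.ERROR)
    logger = logging.getLogger(__name__)

    try:
        raise ConnectionError("redis unavailable")
    except Exception:
        # Emits the message plus the full ConnectionError traceback.
        logger.exception("Failed to send graph engine command")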
@@ -1,9 +1,13 @@
+import logging
+
 from core.tools.tool_manager import ToolManager
 from core.tools.utils.configuration import ToolParameterConfigurationManager
 from core.workflow.nodes import NodeType
 from core.workflow.nodes.tool.entities import ToolEntity
 from events.app_event import app_draft_workflow_was_synced

+logger = logging.getLogger(__name__)
+

 @app_draft_workflow_was_synced.connect
 def handle(sender, **kwargs):
@@ -30,6 +34,10 @@ def handle(sender, **kwargs):
                     identity_id=f"WORKFLOW.{app.id}.{node_data.get('id')}",
                 )
                 manager.delete_tool_parameters_cache()
-            except:
+            except Exception:
                 # tool dose not exist
-                pass
+                logger.exception(
+                    "Failed to delete tool parameters cache for workflow %s node %s",
+                    app.id,
+                    node_data.get("id"),
+                )
@@ -199,9 +199,9 @@ class FileLifecycleManager:
                         # Temporarily create basic metadata information
                     except ValueError:
                         continue
-        except:
+        except Exception:
             # If cannot scan version files, only return current version
-            pass
+            logger.exception("Failed to scan version files for %s", filename)

         return sorted(versions, key=lambda x: x.version or 0, reverse=True)

@@ -211,7 +211,7 @@ class AppService:
                         # override tool parameters
                         tool["tool_parameters"] = masked_parameter
                     except Exception:
-                        pass
+                        logger.exception("Failed to mask agent tool parameters for tool %s", agent_tool_entity.tool_name)

                 # override agent mode
                 if model_config:
@@ -1,4 +1,5 @@
 import json
+import logging
 from collections.abc import Mapping
 from datetime import datetime
 from typing import Any
@@ -19,6 +20,8 @@ from models.tools import WorkflowToolProvider
 from models.workflow import Workflow
 from services.tools.tools_transform_service import ToolTransformService

+logger = logging.getLogger(__name__)
+

 class WorkflowToolManageService:
     """
@@ -198,7 +201,7 @@ class WorkflowToolManageService:
                 tools.append(ToolTransformService.workflow_provider_to_controller(provider))
             except Exception:
                 # skip deleted tools
-                pass
+                logger.exception("Failed to load workflow tool provider %s", provider.id)

         labels = ToolLabelManager.get_tools_labels([t for t in tools if isinstance(t, ToolProviderController)])
@@ -1,4 +1,5 @@
 import json
+import logging
 import operator
 import typing

@@ -12,6 +13,8 @@ from core.plugin.impl.plugin import PluginInstaller
 from extensions.ext_redis import redis_client
 from models.account import TenantPluginAutoUpgradeStrategy

+logger = logging.getLogger(__name__)
+
 RETRY_TIMES_OF_ONE_PLUGIN_IN_ONE_TENANT = 3
 CACHE_REDIS_KEY_PREFIX = "plugin_autoupgrade_check_task:cached_plugin_manifests:"
 CACHE_REDIS_TTL = 60 * 15  # 15 minutes
@@ -42,6 +45,7 @@ def _get_cached_manifest(plugin_id: str) -> typing.Union[MarketplacePluginDeclar

             return MarketplacePluginDeclaration.model_validate(cached_json)
     except Exception:
+        logger.exception("Failed to get cached manifest for plugin %s", plugin_id)
         return False

@@ -63,7 +67,7 @@ def _set_cached_manifest(plugin_id: str, manifest: typing.Union[MarketplacePlugi
     except Exception:
         # If Redis fails, continue without caching
         # traceback.print_exc()
-        pass
+        logger.exception("Failed to set cached manifest for plugin %s", plugin_id)


 def marketplace_batch_fetch_plugin_manifests(
@@ -0,0 +1,39 @@
+"""Tests for the EventManager."""
+
+from __future__ import annotations
+
+import logging
+
+from core.workflow.graph_engine.event_management.event_manager import EventManager
+from core.workflow.graph_engine.layers.base import GraphEngineLayer
+from core.workflow.graph_events import GraphEngineEvent
+
+
+class _FaultyLayer(GraphEngineLayer):
+    """Layer that raises from on_event to test error handling."""
+
+    def on_graph_start(self) -> None:  # pragma: no cover - not used in tests
+        pass
+
+    def on_event(self, event: GraphEngineEvent) -> None:
+        raise RuntimeError("boom")
+
+    def on_graph_end(self, error: Exception | None) -> None:  # pragma: no cover - not used in tests
+        pass
+
+
+def test_event_manager_logs_layer_errors(caplog) -> None:
+    """Ensure errors raised by layers are logged when collecting events."""
+
+    event_manager = EventManager()
+    event_manager.set_layers([_FaultyLayer()])
+
+    with caplog.at_level(logging.ERROR):
+        event_manager.collect(GraphEngineEvent())
+
+    error_logs = [record for record in caplog.records if "Error in layer on_event" in record.getMessage()]
+    assert error_logs, "Expected layer errors to be logged"
+
+    log_record = error_logs[0]
+    assert log_record.exc_info is not None
+    assert isinstance(log_record.exc_info[1], RuntimeError)
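The test leans on two standard behaviors: `caplog` is pytest's built-in log-capture fixture, and `LogRecord.getMessage()` applies the `%`-style arguments, so the substring check matches the fully rendered message while `exc_info` still carries the original `RuntimeError`. It can be run in isolation with something like `pytest -k test_event_manager_logs_layer_errors` from the API package root (exact test path assumed).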