From dabad463931941a887ba38bae2571f7b37fa9937 Mon Sep 17 00:00:00 2001
From: jyong <718720800@qq.com>
Date: Mon, 9 Mar 2026 15:56:03 +0800
Subject: [PATCH] evaluation runtime

---
 .../runners/agent_evaluation_runner.py     | 10 ++++--
 .../runners/base_evaluation_runner.py      | 32 +++++++++----------
 .../runners/llm_evaluation_runner.py       | 10 ++++--
 .../runners/retrieval_evaluation_runner.py | 10 ++++--
 .../runners/snippet_evaluation_runner.py   | 10 ++++--
 .../runners/workflow_evaluation_runner.py  | 10 ++++--
 api/tasks/evaluation_task.py               | 19 +++++++----
 7 files changed, 63 insertions(+), 38 deletions(-)

diff --git a/api/core/evaluation/runners/agent_evaluation_runner.py b/api/core/evaluation/runners/agent_evaluation_runner.py
index 90ed0a2590..5c03c376dd 100644
--- a/api/core/evaluation/runners/agent_evaluation_runner.py
+++ b/api/core/evaluation/runners/agent_evaluation_runner.py
@@ -6,10 +6,13 @@ from sqlalchemy.orm import Session

 from core.evaluation.base_evaluation_instance import BaseEvaluationInstance
 from core.evaluation.entities.evaluation_entity import (
+    CustomizedMetrics,
+    DefaultMetric,
     EvaluationItemInput,
     EvaluationItemResult,
 )
 from core.evaluation.runners.base_evaluation_runner import BaseEvaluationRunner
+from core.workflow.node_events import NodeRunResult
 from models.model import App

 logger = logging.getLogger(__name__)
@@ -66,9 +69,10 @@ class AgentEvaluationRunner(BaseEvaluationRunner):

     def evaluate_metrics(
         self,
-        items: list[EvaluationItemInput],
-        results: list[EvaluationItemResult],
-        default_metrics: list[dict[str, Any]],
+        node_run_result_mapping: dict[str, NodeRunResult] | None,
+        node_run_result: NodeRunResult | None,
+        default_metric: DefaultMetric | None,
+        customized_metrics: CustomizedMetrics | None,
         model_provider: str,
         model_name: str,
         tenant_id: str,
diff --git a/api/core/evaluation/runners/base_evaluation_runner.py b/api/core/evaluation/runners/base_evaluation_runner.py
index d851ed6401..1c57f9c99e 100644
--- a/api/core/evaluation/runners/base_evaluation_runner.py
+++ b/api/core/evaluation/runners/base_evaluation_runner.py
@@ -42,9 +42,10 @@ class BaseEvaluationRunner(ABC):
     @abstractmethod
     def evaluate_metrics(
         self,
-        items: list[EvaluationItemInput],
-        results: list[EvaluationItemResult],
-        default_metrics: list[dict[str, Any]],
+        node_run_result_mapping: dict[str, NodeRunResult] | None,
+        node_run_result: NodeRunResult | None,
+        default_metric: DefaultMetric | None,
+        customized_metrics: CustomizedMetrics | None,
         model_provider: str,
         model_name: str,
         tenant_id: str,
@@ -58,11 +59,12 @@
         tenant_id: str,
         target_id: str,
         target_type: str,
-        node_run_result: NodeRunResult,
+        node_run_result: NodeRunResult | None = None,
         default_metric: DefaultMetric | None = None,
         customized_metrics: CustomizedMetrics | None = None,
         model_provider: str = "",
-        model_name: str = "",
+        model_name: str = "",
+        node_run_result_mapping: dict[str, NodeRunResult] | None = None,
     ) -> list[EvaluationItemResult]:
         """Orchestrate target execution + metric evaluation + judgment for all items."""
         evaluation_run = self.session.query(EvaluationRun).filter_by(id=evaluation_run_id).first()
@@ -82,17 +84,15 @@
         # Phase 1: run evaluation
         if node_run_result.status == WorkflowNodeExecutionStatus.SUCCEEDED:
             try:
-                if customized_metrics is not None:
-                    # Customized workflow evaluation — target-type agnostic
-                    evaluated_results = self._evaluate_customized(
-                        successful_items, successful_results, customized_metrics, tenant_id,
-                    )
-                else:
-                    # Framework-specific evaluation — delegate to subclass
-                    evaluated_results = self.evaluate_metrics(
-                        successful_items, successful_results, default_metrics,
-                        model_provider, model_name, tenant_id,
-                    )
+                evaluated_results = self.evaluate_metrics(
+                    node_run_result_mapping=node_run_result_mapping,
+                    node_run_result=node_run_result,
+                    default_metric=default_metric,
+                    customized_metrics=customized_metrics,
+                    model_provider=model_provider,
+                    model_name=model_name,
+                    tenant_id=tenant_id,
+                )
                 # Merge evaluated metrics back into results
                 evaluated_by_index = {r.index: r for r in evaluated_results}
                 for i, result in enumerate(results):
diff --git a/api/core/evaluation/runners/llm_evaluation_runner.py b/api/core/evaluation/runners/llm_evaluation_runner.py
index aa746751ca..b34d57f9f6 100644
--- a/api/core/evaluation/runners/llm_evaluation_runner.py
+++ b/api/core/evaluation/runners/llm_evaluation_runner.py
@@ -6,10 +6,13 @@ from sqlalchemy.orm import Session

 from core.evaluation.base_evaluation_instance import BaseEvaluationInstance
 from core.evaluation.entities.evaluation_entity import (
+    CustomizedMetrics,
+    DefaultMetric,
     EvaluationItemInput,
     EvaluationItemResult,
 )
 from core.evaluation.runners.base_evaluation_runner import BaseEvaluationRunner
+from core.workflow.node_events import NodeRunResult
 from models.model import App, AppMode

 logger = logging.getLogger(__name__)
@@ -23,9 +26,10 @@ class LLMEvaluationRunner(BaseEvaluationRunner):

     def evaluate_metrics(
         self,
-        items: list[EvaluationItemInput],
-        results: list[EvaluationItemResult],
-        default_metrics: list[dict[str, Any]],
+        node_run_result_mapping: dict[str, NodeRunResult] | None,
+        node_run_result: NodeRunResult | None,
+        default_metric: DefaultMetric | None,
+        customized_metrics: CustomizedMetrics | None,
         model_provider: str,
         model_name: str,
         tenant_id: str,
diff --git a/api/core/evaluation/runners/retrieval_evaluation_runner.py b/api/core/evaluation/runners/retrieval_evaluation_runner.py
index 57cffd2e9e..44bf67ea34 100644
--- a/api/core/evaluation/runners/retrieval_evaluation_runner.py
+++ b/api/core/evaluation/runners/retrieval_evaluation_runner.py
@@ -5,10 +5,13 @@ from sqlalchemy.orm import Session

 from core.evaluation.base_evaluation_instance import BaseEvaluationInstance
 from core.evaluation.entities.evaluation_entity import (
+    CustomizedMetrics,
+    DefaultMetric,
     EvaluationItemInput,
     EvaluationItemResult,
 )
 from core.evaluation.runners.base_evaluation_runner import BaseEvaluationRunner
+from core.workflow.node_events import NodeRunResult

 logger = logging.getLogger(__name__)

@@ -21,9 +24,10 @@ class RetrievalEvaluationRunner(BaseEvaluationRunner):

     def evaluate_metrics(
         self,
-        items: list[EvaluationItemInput],
-        results: list[EvaluationItemResult],
-        default_metrics: list[dict[str, Any]],
+        node_run_result_mapping: dict[str, NodeRunResult] | None,
+        node_run_result: NodeRunResult | None,
+        default_metric: DefaultMetric | None,
+        customized_metrics: CustomizedMetrics | None,
         model_provider: str,
         model_name: str,
         tenant_id: str,
diff --git a/api/core/evaluation/runners/snippet_evaluation_runner.py b/api/core/evaluation/runners/snippet_evaluation_runner.py
index 5b763c7031..b7e16f9772 100644
--- a/api/core/evaluation/runners/snippet_evaluation_runner.py
+++ b/api/core/evaluation/runners/snippet_evaluation_runner.py
@@ -16,10 +16,13 @@ from sqlalchemy.orm import Session

 from core.evaluation.base_evaluation_instance import BaseEvaluationInstance
 from core.evaluation.entities.evaluation_entity import (
+    CustomizedMetrics,
+    DefaultMetric,
     EvaluationItemInput,
     EvaluationItemResult,
 )
 from core.evaluation.runners.base_evaluation_runner import BaseEvaluationRunner
+from core.workflow.node_events import NodeRunResult
 from models.snippet import CustomizedSnippet
 from models.workflow import WorkflowNodeExecutionModel

@@ -89,9 +92,10 @@ class SnippetEvaluationRunner(BaseEvaluationRunner):

     def evaluate_metrics(
         self,
-        items: list[EvaluationItemInput],
-        results: list[EvaluationItemResult],
-        default_metrics: list[dict[str, Any]],
+        node_run_result_mapping: dict[str, NodeRunResult] | None,
+        node_run_result: NodeRunResult | None,
+        default_metric: DefaultMetric | None,
+        customized_metrics: CustomizedMetrics | None,
         model_provider: str,
         model_name: str,
         tenant_id: str,
diff --git a/api/core/evaluation/runners/workflow_evaluation_runner.py b/api/core/evaluation/runners/workflow_evaluation_runner.py
index dc968b93b7..2fcab86ef0 100644
--- a/api/core/evaluation/runners/workflow_evaluation_runner.py
+++ b/api/core/evaluation/runners/workflow_evaluation_runner.py
@@ -6,10 +6,13 @@ from sqlalchemy.orm import Session

 from core.evaluation.base_evaluation_instance import BaseEvaluationInstance
 from core.evaluation.entities.evaluation_entity import (
+    CustomizedMetrics,
+    DefaultMetric,
     EvaluationItemInput,
     EvaluationItemResult,
 )
 from core.evaluation.runners.base_evaluation_runner import BaseEvaluationRunner
+from core.workflow.node_events import NodeRunResult
 from models.model import App

 logger = logging.getLogger(__name__)
@@ -23,9 +26,10 @@ class WorkflowEvaluationRunner(BaseEvaluationRunner):

     def evaluate_metrics(
         self,
-        items: list[EvaluationItemInput],
-        results: list[EvaluationItemResult],
-        default_metrics: list[dict[str, Any]],
+        node_run_result_mapping: dict[str, NodeRunResult] | None,
+        node_run_result: NodeRunResult | None,
+        default_metric: DefaultMetric | None,
+        customized_metrics: CustomizedMetrics | None,
         model_provider: str,
         model_name: str,
         tenant_id: str,
diff --git a/api/tasks/evaluation_task.py b/api/tasks/evaluation_task.py
index a7d7fc3724..2304b01586 100644
--- a/api/tasks/evaluation_task.py
+++ b/api/tasks/evaluation_task.py
@@ -129,13 +129,18 @@ def _execute_evaluation_runner(
             )
         else:
             default_metric.score = 0
-    for customized_metric in customized_metrics:
-        runner = _create_runner(run_data.evaluation_category, evaluation_instance, session)
-        runner.run(
-            evaluation_run_id=run_data.evaluation_run_id,
-            tenant_id=run_data.tenant_id,
-            target_id=run_data.target_id,
-        )
+    if customized_metrics:
+        runner = _create_runner(EvaluationCategory.WORKFLOW, evaluation_instance, session)
+        runner.run(
+            evaluation_run_id=run_data.evaluation_run_id,
+            tenant_id=run_data.tenant_id,
+            target_id=run_data.target_id,
+            target_type=run_data.target_type,
+            default_metric=None,
+            customized_metrics=customized_metrics,
+            node_run_result=None,
+            node_run_result_mapping=node_run_result_mapping,
+        )

 def _create_runner(
     category: EvaluationCategory,
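
For orientation, a concrete runner could satisfy the new evaluate_metrics contract roughly as sketched below. This is a minimal illustration, not code from this patch: the dataclasses are simplified stand-ins for the real evaluation entities, and the scoring rule (any non-empty outputs counts as a pass) is invented for the example.

    from dataclasses import dataclass, field


    @dataclass
    class NodeRunResult:
        # Stand-in for core.workflow.node_events.NodeRunResult; only the
        # field this sketch needs.
        outputs: dict[str, object] = field(default_factory=dict)


    @dataclass
    class DefaultMetric:
        # Stand-in for the DefaultMetric entity; field names are assumptions.
        name: str = "accuracy"
        score: float = 0.0


    @dataclass
    class CustomizedMetrics:
        # Stand-in for the CustomizedMetrics entity; field name is an assumption.
        workflow_id: str = ""


    @dataclass
    class EvaluationItemResult:
        # Stand-in for the EvaluationItemResult entity.
        index: int
        metrics: dict[str, float] = field(default_factory=dict)


    class ExampleEvaluationRunner:
        def evaluate_metrics(
            self,
            node_run_result_mapping: dict[str, NodeRunResult] | None,
            node_run_result: NodeRunResult | None,
            default_metric: DefaultMetric | None,
            customized_metrics: CustomizedMetrics | None,
            model_provider: str,
            model_name: str,
            tenant_id: str,
        ) -> list[EvaluationItemResult]:
            results: list[EvaluationItemResult] = []
            if customized_metrics is not None and node_run_result_mapping:
                # Customized path: score each node's run result; a real runner
                # would execute the customized metric workflow here.
                for i, run_result in enumerate(node_run_result_mapping.values()):
                    score = 1.0 if run_result.outputs else 0.0
                    results.append(EvaluationItemResult(index=i, metrics={"customized": score}))
            elif default_metric is not None and node_run_result is not None:
                # Built-in path: score the single run result against the default metric.
                score = 1.0 if node_run_result.outputs else 0.0
                results.append(EvaluationItemResult(index=0, metrics={default_metric.name: score}))
            return results

Under this shape, the base runner's run() can delegate unconditionally, as the base_evaluation_runner.py hunk does: the subclass decides which path applies from the arguments it receives.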
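Exercising the sketch above with both call shapes, mirroring how evaluation_task.py passes node_run_result=None alongside node_run_result_mapping for customized metrics; all values are illustrative:

    runner = ExampleEvaluationRunner()
    mapping = {"node-1": NodeRunResult(outputs={"text": "hello"})}

    # Customized metrics: only the per-node mapping is provided.
    customized_results = runner.evaluate_metrics(
        node_run_result_mapping=mapping,
        node_run_result=None,
        default_metric=None,
        customized_metrics=CustomizedMetrics(workflow_id="metric-wf"),
        model_provider="",
        model_name="",
        tenant_id="tenant-1",
    )

    # Built-in metric: a single NodeRunResult carries the target output.
    built_in_results = runner.evaluate_metrics(
        node_run_result_mapping=None,
        node_run_result=NodeRunResult(outputs={"text": "hello"}),
        default_metric=DefaultMetric(name="accuracy"),
        customized_metrics=None,
        model_provider="openai",
        model_name="gpt-4o",
        tenant_id="tenant-1",
    )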