diff --git a/api/core/evaluation/entities/evaluation_entity.py b/api/core/evaluation/entities/evaluation_entity.py index 7d84e02c15..53138d253c 100644 --- a/api/core/evaluation/entities/evaluation_entity.py +++ b/api/core/evaluation/entities/evaluation_entity.py @@ -24,6 +24,7 @@ class EvaluationMetric(BaseModel): class EvaluationItemInput(BaseModel): index: int inputs: dict[str, Any] + output: str expected_output: str | None = None context: list[str] | None = None @@ -89,4 +90,4 @@ class EvaluationRunData(BaseModel): default_metrics: list[DefaultMetric] = Field(default_factory=list) customized_metrics: CustomizedMetrics | None = None judgment_config: JudgmentConfig | None = None - input_list: list[dict] + input_list: list[EvaluationItemInput] diff --git a/api/core/evaluation/runners/base_evaluation_runner.py b/api/core/evaluation/runners/base_evaluation_runner.py index 1c57f9c99e..e75748ed17 100644 --- a/api/core/evaluation/runners/base_evaluation_runner.py +++ b/api/core/evaluation/runners/base_evaluation_runner.py @@ -42,8 +42,8 @@ class BaseEvaluationRunner(ABC): @abstractmethod def evaluate_metrics( self, - node_run_result_mapping: dict[str, NodeRunResult] | None, - node_run_result: NodeRunResult | None, + node_run_result_mapping_list: list[dict[str, NodeRunResult]] | None, + node_run_result_list: list[NodeRunResult] | None, default_metric: DefaultMetric | None, customized_metrics: CustomizedMetrics | None, model_provider: str, @@ -59,12 +59,12 @@ class BaseEvaluationRunner(ABC): tenant_id: str, target_id: str, target_type: str, - node_run_result: NodeRunResult | None = None, + node_run_result_list: list[NodeRunResult] | None = None, default_metric: DefaultMetric | None = None, customized_metrics: CustomizedMetrics | None = None, model_provider: str = "", model_name: str = "", - node_run_result_mapping: dict[str, NodeRunResult] | None = None, + node_run_result_mapping_list: list[dict[str, NodeRunResult]] | None = None, ) -> list[EvaluationItemResult]: """Orchestrate target execution + metric evaluation + judgment for all items.""" evaluation_run = self.session.query(EvaluationRun).filter_by(id=evaluation_run_id).first() @@ -82,11 +82,11 @@ class BaseEvaluationRunner(ABC): results: list[EvaluationItemResult] = [] # Phase 1: run evaluation - if node_run_result.status == WorkflowNodeExecutionStatus.SUCCEEDED: + if default_metric and node_run_result_list: try: evaluated_results = self.evaluate_metrics( - node_run_result_mapping=node_run_result_mapping, - node_run_result=node_run_result, + node_run_result_mapping_list=node_run_result_mapping_list, + node_run_result_list=node_run_result_list, default_metric=default_metric, customized_metrics=customized_metrics, model_provider=model_provider, @@ -100,6 +100,19 @@ class BaseEvaluationRunner(ABC): results[i] = evaluated_by_index[result.index] except Exception: logger.exception("Failed to compute metrics for evaluation run %s", evaluation_run_id) + if customized_metrics and node_run_result_mapping_list: + try: + evaluated_results = self.evaluate_metrics( + node_run_result_mapping_list=node_run_result_mapping_list, + node_run_result_list=node_run_result_list, + default_metric=default_metric, + customized_metrics=customized_metrics, + model_provider=model_provider, + model_name=model_name, + tenant_id=tenant_id, + ) + except Exception: + logger.exception("Failed to compute metrics for evaluation run %s", evaluation_run_id) # Phase 4: Persist individual items for result in results: diff --git a/api/tasks/evaluation_task.py b/api/tasks/evaluation_task.py index 2304b01586..8c08c09c7b 100644 --- a/api/tasks/evaluation_task.py +++ b/api/tasks/evaluation_task.py @@ -27,6 +27,7 @@ from extensions.ext_database import db from libs.datetime_utils import naive_utc_now from models.evaluation import EvaluationRun, EvaluationRunStatus from models.model import UploadFile +from services.evaluation_service import EvaluationService logger = logging.getLogger(__name__) @@ -76,7 +77,20 @@ def _execute_evaluation(session: Any, run_data: EvaluationRunData) -> None: if evaluation_instance is None: raise ValueError("Evaluation framework not configured") - _execute_evaluation_runner(session, run_data, evaluation_instance, node_run_result_mapping) + evaluation_service = EvaluationService() + node_run_result_mapping_list: list[dict[str, NodeRunResult]] = evaluation_service.execute_targets( + tenant_id=run_data.tenant_id, + target_type=run_data.target_type, + target_id=run_data.target_id, + input_list=run_data.input_list, + ) + + results: list[EvaluationItemResult] = _execute_evaluation_runner( + session, + run_data, + evaluation_instance, + node_run_result_mapping_list, + ) # Compute summary metrics @@ -106,15 +120,19 @@ def _execute_evaluation_runner( session: Any, run_data: EvaluationRunData, evaluation_instance: BaseEvaluationInstance, - node_run_result_mapping: dict[str, NodeRunResult], + node_run_result_mapping_list: list[dict[str, NodeRunResult]], ) -> list[EvaluationItemResult]: """Execute the evaluation runner.""" default_metrics = run_data.default_metrics customized_metrics = run_data.customized_metrics for default_metric in default_metrics: for node_info in default_metric.node_info_list: - node_run_result = node_run_result_mapping.get(node_info.node_id) - if node_run_result: + node_run_result_list: list[NodeRunResult] = [] + for node_run_result_mapping in node_run_result_mapping_list: + node_run_result = node_run_result_mapping.get(node_info.node_id) + if node_run_result is not None: + node_run_result_list.append(node_run_result) + if node_run_result_list: runner = _create_runner(EvaluationCategory(node_info.type), evaluation_instance, session) runner.run( evaluation_run_id=run_data.evaluation_run_id, @@ -125,10 +143,8 @@ def _execute_evaluation_runner( customized_metrics=None, model_provider=run_data.evaluation_model_provider, model_name=run_data.evaluation_model, - node_run_result=node_run_result, + node_run_result_list=node_run_result_list, ) - else: - default_metric.score = 0 if customized_metrics: runner = _create_runner(EvaluationCategory.WORKFLOW, evaluation_instance, session) runner.run( @@ -138,8 +154,8 @@ def _execute_evaluation_runner( target_type=run_data.target_type, default_metric=None, customized_metrics=customized_metrics, - node_run_result=None, - node_run_result_mapping=node_run_result_mapping, + node_run_result_list=None, + node_run_result_mapping_list=node_run_result_mapping_list, ) def _create_runner(