diff --git a/api/core/evaluation/entities/evaluation_entity.py b/api/core/evaluation/entities/evaluation_entity.py
index 63fe180899..fcecd9d469 100644
--- a/api/core/evaluation/entities/evaluation_entity.py
+++ b/api/core/evaluation/entities/evaluation_entity.py
@@ -41,14 +41,6 @@ class EvaluationItemResult(BaseModel):
     metrics: list[EvaluationMetric] = Field(default_factory=list)
     error: str | None = None
 
-    @property
-    def overall_score(self) -> float | None:
-        if not self.metrics:
-            return None
-        scores = [m.score for m in self.metrics]
-        return sum(scores) / len(scores)
-
-
 class NodeInfo(BaseModel):
     node_id: str
     type: str
diff --git a/api/services/evaluation_service.py b/api/services/evaluation_service.py
index 101d794c20..02046c62cb 100644
--- a/api/services/evaluation_service.py
+++ b/api/services/evaluation_service.py
@@ -15,7 +15,6 @@ from core.evaluation.entities.evaluation_entity import (
     EvaluationCategory,
     EvaluationConfigData,
     EvaluationDatasetInput,
-    EvaluationItemInput,
     EvaluationRunData,
     EvaluationRunRequest,
 )
@@ -156,6 +155,8 @@ class EvaluationService:
         """
         wb = Workbook()
         ws = wb.active
+        if ws is None:
+            ws = wb.create_sheet("Evaluation Dataset")
         sheet_name = "Evaluation Dataset"
         ws.title = sheet_name
 
@@ -174,7 +175,7 @@ class EvaluationService:
 
         headers = ["index"]
         for field in input_fields:
-            field_label = field.get("label") or field.get("variable")
+            field_label = str(field.get("label") or field.get("variable") or "")
             headers.append(field_label)
 
         # Write header row
@@ -279,9 +280,6 @@ class EvaluationService:
         if evaluation_instance is None:
             raise EvaluationFrameworkNotConfiguredError()
 
-        # Derive evaluation_category from default_metrics node types
-        evaluation_category = cls._resolve_evaluation_category(run_request.default_metrics)
-
         # Save as latest EvaluationConfiguration
         config = cls.save_evaluation_config(
             session=session,
@@ -333,12 +331,10 @@ class EvaluationService:
             target_id=target_id,
             evaluation_model_provider=run_request.evaluation_model_provider,
             evaluation_model=run_request.evaluation_model,
-            default_metrics=[m.model_dump() for m in run_request.default_metrics],
-            customized_metrics=(
-                run_request.customized_metrics.model_dump() if run_request.customized_metrics else None
-            ),
+            default_metrics=run_request.default_metrics,
+            customized_metrics=run_request.customized_metrics,
             judgment_config=run_request.judgment_config,
-            items=items,
+            input_list=items,
         )
 
         # Dispatch Celery task
@@ -648,7 +644,7 @@ class EvaluationService:
     # ---- Dataset Parsing ----
 
     @classmethod
-    def _parse_dataset(cls, xlsx_content: bytes) -> list[EvaluationItemInput]:
+    def _parse_dataset(cls, xlsx_content: bytes) -> list[EvaluationDatasetInput]:
         """Parse evaluation dataset from XLSX bytes."""
         wb = load_workbook(io.BytesIO(xlsx_content), read_only=True)
         ws = wb.active
@@ -672,7 +668,7 @@ class EvaluationService:
             index_val = values[0] if values else row_idx
 
             try:
-                index = int(index_val)
+                index = int(str(index_val))
             except (TypeError, ValueError):
                 index = row_idx
 
@@ -681,17 +677,14 @@ class EvaluationService:
                 val = values[col_idx + 1] if col_idx + 1 < len(values) else None
                 inputs[header] = str(val) if val is not None else ""
 
-            # Check for expected_output column
+            # Extract expected_output column into dedicated field
             expected_output = inputs.pop("expected_output", None)
-            context_str = inputs.pop("context", None)
-            context = context_str.split(";") if context_str else None
 
             items.append(
-                EvaluationItemInput(
+                EvaluationDatasetInput(
                     index=index,
                     inputs=inputs,
                     expected_output=expected_output,
-                    context=context,
                 )
             )
 
diff --git a/api/tasks/evaluation_task.py b/api/tasks/evaluation_task.py
index 8c08c09c7b..742509ff53 100644
--- a/api/tasks/evaluation_task.py
+++ b/api/tasks/evaluation_task.py
@@ -25,6 +25,7 @@ from core.evaluation.runners.workflow_evaluation_runner import WorkflowEvaluatio
 from core.workflow.node_events.base import NodeRunResult
 from extensions.ext_database import db
 from libs.datetime_utils import naive_utc_now
+from models.enums import CreatorUserRole
 from models.evaluation import EvaluationRun, EvaluationRunStatus
 from models.model import UploadFile
 from services.evaluation_service import EvaluationService
@@ -116,6 +117,7 @@ def _execute_evaluation(session: Any, run_data: EvaluationRunData) -> None:
 
     logger.info("Evaluation run %s completed successfully", run_data.evaluation_run_id)
 
+
 def _execute_evaluation_runner(
     session: Any,
     run_data: EvaluationRunData,
@@ -125,6 +127,7 @@ def _execute_evaluation_runner(
     """Execute the evaluation runner."""
     default_metrics = run_data.default_metrics
     customized_metrics = run_data.customized_metrics
+    results: list[EvaluationItemResult] = []
     for default_metric in default_metrics:
         for node_info in default_metric.node_info_list:
             node_run_result_list: list[NodeRunResult] = []
@@ -134,7 +137,7 @@ def _execute_evaluation_runner(
                 node_run_result_list.append(node_run_result)
             if node_run_result_list:
                 runner = _create_runner(EvaluationCategory(node_info.type), evaluation_instance, session)
-                runner.run(
+                results.extend(runner.run(
                     evaluation_run_id=run_data.evaluation_run_id,
                     tenant_id=run_data.tenant_id,
                     target_id=run_data.target_id,
@@ -144,10 +147,10 @@ def _execute_evaluation_runner(
                     model_provider=run_data.evaluation_model_provider,
                     model_name=run_data.evaluation_model,
                     node_run_result_list=node_run_result_list,
-                )
+                ))
     if customized_metrics:
         runner = _create_runner(EvaluationCategory.WORKFLOW, evaluation_instance, session)
-        runner.run(
+        results.extend(runner.run(
             evaluation_run_id=run_data.evaluation_run_id,
             tenant_id=run_data.tenant_id,
             target_id=run_data.target_id,
@@ -156,7 +159,9 @@ def _execute_evaluation_runner(
             customized_metrics=customized_metrics,
             node_run_result_list=None,
             node_run_result_mapping_list=node_run_result_mapping_list,
-        )
+        ))
+    return results
+
 
 def _create_runner(
     category: EvaluationCategory,
@@ -201,7 +206,7 @@ def _compute_metrics_summary(results: list[EvaluationItemResult]) -> dict[str, A
         for metric in result.metrics:
             if metric.name not in metric_scores:
                 metric_scores[metric.name] = []
-            metric_scores[metric.name].append(metric.score)
+            metric_scores[metric.name].append(float(metric.value))
 
     summary: dict[str, Any] = {}
     for name, scores in metric_scores.items():
@@ -231,6 +236,8 @@ def _generate_result_xlsx(
     """Generate result XLSX with input data, actual output, and metric scores."""
     wb = Workbook()
     ws = wb.active
+    if ws is None:
+        ws = wb.create_sheet("Evaluation Results")
     ws.title = "Evaluation Results"
 
     header_font = Font(bold=True, color="FFFFFF")
@@ -306,7 +313,7 @@ def _generate_result_xlsx(
             col += 1
 
         # Metric scores
-        metric_scores = {m.name: m.score for m in result.metrics} if result else {}
+        metric_scores = {m.name: m.value for m in result.metrics} if result else {}
         for metric_name in all_metric_names:
             score = metric_scores.get(metric_name)
             ws.cell(row=row_idx, column=col, value=score if score is not None else "").border = thin_border
@@ -351,7 +358,7 @@ def _store_result_file(
         size=len(xlsx_content),
         extension="xlsx",
         mime_type="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
-        created_by_role="account",
+        created_by_role=CreatorUserRole.ACCOUNT,
         created_by="system",
         created_at=naive_utc_now(),
         used=False,