diff --git a/api/controllers/console/evaluation/evaluation.py b/api/controllers/console/evaluation/evaluation.py
index 89acac1013..50f5834e39 100644
--- a/api/controllers/console/evaluation/evaluation.py
+++ b/api/controllers/console/evaluation/evaluation.py
@@ -20,9 +20,10 @@ from controllers.console.wraps import (
     edit_permission_required,
     setup_required,
 )
-from core.evaluation.entities.evaluation_entity import EvaluationCategory
+from core.evaluation.entities.evaluation_entity import EvaluationCategory, EvaluationRunRequest
 from core.workflow.file import helpers as file_helpers
 from extensions.ext_database import db
+from extensions.ext_storage import storage
 from libs.helper import TimestampField
 from libs.login import current_account_with_tenant, login_required
 from models import App
@@ -332,33 +333,43 @@ class EvaluationRunApi(Resource):
         """
         Start an evaluation run.
 
-        Expects multipart form data with:
-        - file: XLSX dataset file
-        - evaluation_category: one of llm, retrieval, agent, workflow
+        Expects JSON body with:
+        - file_id: uploaded dataset file ID
+        - evaluation_model: evaluation model name
+        - evaluation_model_provider: evaluation model provider
+        - default_metrics: list of default metric objects
+        - customized_metrics: customized metrics object (optional)
+        - judgment_config: judgment conditions config (optional)
         """
         current_account, current_tenant_id = current_account_with_tenant()
 
-        # Validate file upload
-        if "file" not in request.files:
-            raise BadRequest("Dataset file is required.")
-        file = request.files["file"]
-        if not file.filename or not file.filename.endswith(".xlsx"):
-            raise BadRequest("Dataset file must be an XLSX file.")
+        body = request.get_json(force=True)
+        if not body:
+            raise BadRequest("Request body is required.")
+
+        # Validate and parse request body
+        try:
+            run_request = EvaluationRunRequest.model_validate(body)
+        except Exception as e:
+            raise BadRequest(f"Invalid request body: {e}") from e
+
+        # Load dataset file
+        upload_file = (
+            db.session.query(UploadFile)
+            .filter_by(id=run_request.file_id, tenant_id=current_tenant_id)
+            .first()
+        )
+        if not upload_file:
+            raise NotFound("Dataset file not found.")
+
+        try:
+            dataset_content = storage.load_once(upload_file.key)
+        except Exception:
+            raise BadRequest("Failed to read dataset file.")
 
-        dataset_content = file.read()
         if not dataset_content:
             raise BadRequest("Dataset file is empty.")
 
-        # Validate evaluation category
-        category_str = request.form.get("evaluation_category", "llm")
-        try:
-            evaluation_category = EvaluationCategory(category_str)
-        except ValueError:
-            raise BadRequest(
-                f"Invalid evaluation_category: {category_str}. "
" - f"Must be one of: {', '.join(e.value for e in EvaluationCategory)}" - ) - try: with Session(db.engine, expire_on_commit=False) as session: evaluation_run = EvaluationService.start_evaluation_run( @@ -368,7 +379,7 @@ class EvaluationRunApi(Resource): target_id=str(target.id), account_id=str(current_account.id), dataset_file_content=dataset_content, - evaluation_category=evaluation_category, + run_request=run_request, ) return _serialize_evaluation_run(evaluation_run), 200 except EvaluationFrameworkNotConfiguredError as e: diff --git a/api/core/evaluation/entities/evaluation_entity.py b/api/core/evaluation/entities/evaluation_entity.py index 026fde642e..1f873706a6 100644 --- a/api/core/evaluation/entities/evaluation_entity.py +++ b/api/core/evaluation/entities/evaluation_entity.py @@ -8,9 +8,10 @@ from core.evaluation.entities.judgment_entity import JudgmentConfig, JudgmentRes class EvaluationCategory(StrEnum): LLM = "llm" - RETRIEVAL = "retrieval" + RETRIEVAL = "knowledge_retrieval" AGENT = "agent" WORKFLOW = "workflow" + RETRIEVAL_TEST = "retrieval_test" class EvaluationMetric(BaseModel): @@ -42,6 +43,38 @@ class EvaluationItemResult(BaseModel): return sum(scores) / len(scores) +class NodeInfo(BaseModel): + node_id: str + type: str + title: str + + +class DefaultMetric(BaseModel): + metric: str + node_info_list: list[NodeInfo] + + +class CustomizedMetricOutputField(BaseModel): + variable: str + value_type: str + + +class CustomizedMetrics(BaseModel): + evaluation_workflow_id: str + input_fields: dict[str, str] + output_fields: list[CustomizedMetricOutputField] + + +class EvaluationRunRequest(BaseModel): + """Request body for starting an evaluation run.""" + file_id: str + evaluation_model: str = "" + evaluation_model_provider: str = "" + default_metrics: list[DefaultMetric] = Field(default_factory=list) + customized_metrics: CustomizedMetrics | None = None + judgment_config: JudgmentConfig | None = None + + class EvaluationRunData(BaseModel): """Serializable data for Celery task.""" evaluation_run_id: str diff --git a/api/services/evaluation_service.py b/api/services/evaluation_service.py index 3c55595fbe..8cff8cd3b9 100644 --- a/api/services/evaluation_service.py +++ b/api/services/evaluation_service.py @@ -10,9 +10,11 @@ from sqlalchemy.orm import Session from configs import dify_config from core.evaluation.entities.evaluation_entity import ( + DefaultMetric, EvaluationCategory, EvaluationItemInput, EvaluationRunData, + EvaluationRunRequest, ) from core.evaluation.evaluation_manager import EvaluationManager from models.evaluation import ( @@ -255,18 +257,44 @@ class EvaluationService: target_id: str, account_id: str, dataset_file_content: bytes, - evaluation_category: EvaluationCategory, + run_request: EvaluationRunRequest, ) -> EvaluationRun: - """Validate dataset, create run record, dispatch Celery task.""" + """Validate dataset, create run record, dispatch Celery task. + + Saves the provided parameters as the latest EvaluationConfiguration + before creating the run. + """ # Check framework is configured evaluation_instance = EvaluationManager.get_evaluation_instance() if evaluation_instance is None: raise EvaluationFrameworkNotConfiguredError() - # Check evaluation config exists - config = cls.get_evaluation_config(session, tenant_id, target_type, target_id) - if config is None: - raise EvaluationNotFoundError("Evaluation configuration not found. 
+        # Derive evaluation_category from default_metrics node types
+        evaluation_category = cls._resolve_evaluation_category(run_request.default_metrics)
+
+        # Build metrics_config from default_metrics and customized_metrics
+        metrics_config: dict[str, Any] = {
+            "default_metrics": [m.model_dump() for m in run_request.default_metrics],
+        }
+        if run_request.customized_metrics is not None:
+            metrics_config["customized_metrics"] = run_request.customized_metrics.model_dump()
+
+        # Save as latest EvaluationConfiguration
+        config = cls.save_evaluation_config(
+            session=session,
+            tenant_id=tenant_id,
+            target_type=target_type,
+            target_id=target_id,
+            account_id=account_id,
+            data={
+                "evaluation_model_provider": run_request.evaluation_model_provider,
+                "evaluation_model": run_request.evaluation_model,
+                "metrics_config": metrics_config,
+                "judgement_conditions": (
+                    run_request.judgment_config.model_dump() if run_request.judgment_config else {}
+                ),
+            },
+        )
 
         # Check concurrent run limit
         active_runs = (
@@ -308,9 +336,10 @@ class EvaluationService:
             target_type=target_type,
             target_id=target_id,
             evaluation_category=evaluation_category,
-            evaluation_model_provider=config.evaluation_model_provider or "",
-            evaluation_model=config.evaluation_model or "",
-            metrics_config=config.metrics_config_dict,
+            evaluation_model_provider=run_request.evaluation_model_provider,
+            evaluation_model=run_request.evaluation_model,
+            metrics_config=metrics_config,
+            judgment_config=run_request.judgment_config,
             items=items,
         )
 
@@ -406,6 +435,23 @@ class EvaluationService:
     def get_supported_metrics(cls, category: EvaluationCategory) -> list[str]:
         return EvaluationManager.get_supported_metrics(category)
 
+    # ---- Category Resolution ----
+
+    @classmethod
+    def _resolve_evaluation_category(cls, default_metrics: list[DefaultMetric]) -> EvaluationCategory:
+        """Derive the evaluation category from default_metrics node_info types.
+
+        Uses the first node_info whose type maps to a known category.
+        Falls back to LLM if no matching node type is found.
+        """
+ """ + for metric in default_metrics: + for node_info in metric.node_info_list: + try: + return EvaluationCategory(node_info.type) + except ValueError: + continue + return EvaluationCategory.LLM + # ---- Dataset Parsing ---- @classmethod diff --git a/api/tasks/evaluation_task.py b/api/tasks/evaluation_task.py index 8bfd9cbf87..c6daeae9ed 100644 --- a/api/tasks/evaluation_task.py +++ b/api/tasks/evaluation_task.py @@ -1,6 +1,8 @@ import io import json import logging +from configs import dify_config +from models.model import UploadFile from typing import Any from celery import shared_task @@ -279,20 +281,16 @@ def _store_result_file( """Store result XLSX file and return the UploadFile ID.""" try: from extensions.ext_storage import storage - from models.model import UploadFile - from libs.uuid_utils import uuidv7 - file_id = str(uuidv7()) filename = f"evaluation-result-{run_id[:8]}.xlsx" - storage_key = f"evaluation_results/{tenant_id}/{file_id}.xlsx" + storage_key = f"evaluation_results/{tenant_id}/{str(uuidv7())}.xlsx" storage.save(storage_key, xlsx_content) - upload_file = UploadFile( - id=file_id, + upload_file: UploadFile = UploadFile( tenant_id=tenant_id, - storage_type="evaluation_result", + storage_type=dify_config.STORAGE_TYPE, key=storage_key, name=filename, size=len(xlsx_content), @@ -300,10 +298,12 @@ def _store_result_file( mime_type="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", created_by_role="account", created_by="system", + created_at=naive_utc_now(), + used=False, ) session.add(upload_file) session.commit() - return file_id + return upload_file.id except Exception: logger.exception("Failed to store result file for run %s", run_id) return None