diff --git a/api/controllers/console/evaluation/evaluation.py b/api/controllers/console/evaluation/evaluation.py
index 89acac1013..1b0445728d 100644
--- a/api/controllers/console/evaluation/evaluation.py
+++ b/api/controllers/console/evaluation/evaluation.py
@@ -20,9 +20,10 @@ from controllers.console.wraps import (
     edit_permission_required,
     setup_required,
 )
-from core.evaluation.entities.evaluation_entity import EvaluationCategory
+from core.evaluation.entities.evaluation_entity import EvaluationCategory, EvaluationConfigData, EvaluationRunRequest
 from core.workflow.file import helpers as file_helpers
 from extensions.ext_database import db
+from extensions.ext_storage import storage
 from libs.helper import TimestampField
 from libs.login import current_account_with_tenant, login_required
-from models import App
+from models import App, UploadFile
@@ -260,7 +261,12 @@ class EvaluationDetailApi(Resource):
         Save evaluation configuration for the target.
         """
         current_account, current_tenant_id = current_account_with_tenant()
-        data = request.get_json(force=True)
+        body = request.get_json(force=True)
+
+        try:
+            config_data = EvaluationConfigData.model_validate(body)
+        except Exception as e:
+            raise BadRequest(f"Invalid request body: {e}") from e
 
         with Session(db.engine, expire_on_commit=False) as session:
             config = EvaluationService.save_evaluation_config(
@@ -269,7 +275,7 @@ class EvaluationDetailApi(Resource):
                 target_type=target_type,
                 target_id=str(target.id),
                 account_id=str(current_account.id),
-                data=data,
+                data=config_data,
             )
 
         return {
@@ -332,33 +338,43 @@ class EvaluationRunApi(Resource):
         """
         Start an evaluation run.
- Expects multipart form data with: - - file: XLSX dataset file - - evaluation_category: one of llm, retrieval, agent, workflow + Expects JSON body with: + - file_id: uploaded dataset file ID + - evaluation_model: evaluation model name + - evaluation_model_provider: evaluation model provider + - default_metrics: list of default metric objects + - customized_metrics: customized metrics object (optional) + - judgment_config: judgment conditions config (optional) """ current_account, current_tenant_id = current_account_with_tenant() - # Validate file upload - if "file" not in request.files: - raise BadRequest("Dataset file is required.") - file = request.files["file"] - if not file.filename or not file.filename.endswith(".xlsx"): - raise BadRequest("Dataset file must be an XLSX file.") + body = request.get_json(force=True) + if not body: + raise BadRequest("Request body is required.") + + # Validate and parse request body + try: + run_request = EvaluationRunRequest.model_validate(body) + except Exception as e: + raise BadRequest(f"Invalid request body: {e}") + + # Load dataset file + upload_file = ( + db.session.query(UploadFile) + .filter_by(id=run_request.file_id, tenant_id=current_tenant_id) + .first() + ) + if not upload_file: + raise NotFound("Dataset file not found.") + + try: + dataset_content = storage.load_once(upload_file.key) + except Exception: + raise BadRequest("Failed to read dataset file.") - dataset_content = file.read() if not dataset_content: raise BadRequest("Dataset file is empty.") - # Validate evaluation category - category_str = request.form.get("evaluation_category", "llm") - try: - evaluation_category = EvaluationCategory(category_str) - except ValueError: - raise BadRequest( - f"Invalid evaluation_category: {category_str}. 
" - f"Must be one of: {', '.join(e.value for e in EvaluationCategory)}" - ) - try: with Session(db.engine, expire_on_commit=False) as session: evaluation_run = EvaluationService.start_evaluation_run( @@ -368,7 +384,7 @@ class EvaluationRunApi(Resource): target_id=str(target.id), account_id=str(current_account.id), dataset_file_content=dataset_content, - evaluation_category=evaluation_category, + run_request=run_request, ) return _serialize_evaluation_run(evaluation_run), 200 except EvaluationFrameworkNotConfiguredError as e: diff --git a/api/core/evaluation/entities/evaluation_entity.py b/api/core/evaluation/entities/evaluation_entity.py index 026fde642e..96a2cf461b 100644 --- a/api/core/evaluation/entities/evaluation_entity.py +++ b/api/core/evaluation/entities/evaluation_entity.py @@ -8,9 +8,10 @@ from core.evaluation.entities.judgment_entity import JudgmentConfig, JudgmentRes class EvaluationCategory(StrEnum): LLM = "llm" - RETRIEVAL = "retrieval" + RETRIEVAL = "knowledge_retrieval" AGENT = "agent" WORKFLOW = "workflow" + RETRIEVAL_TEST = "retrieval_test" class EvaluationMetric(BaseModel): @@ -42,6 +43,42 @@ class EvaluationItemResult(BaseModel): return sum(scores) / len(scores) +class NodeInfo(BaseModel): + node_id: str + type: str + title: str + + +class DefaultMetric(BaseModel): + metric: str + node_info_list: list[NodeInfo] + + +class CustomizedMetricOutputField(BaseModel): + variable: str + value_type: str + + +class CustomizedMetrics(BaseModel): + evaluation_workflow_id: str + input_fields: dict[str, str] + output_fields: list[CustomizedMetricOutputField] + + +class EvaluationConfigData(BaseModel): + """Structured data for saving evaluation configuration.""" + evaluation_model: str = "" + evaluation_model_provider: str = "" + default_metrics: list[DefaultMetric] = Field(default_factory=list) + customized_metrics: CustomizedMetrics | None = None + judgment_config: JudgmentConfig | None = None + + +class EvaluationRunRequest(EvaluationConfigData): + 
"""Request body for starting an evaluation run.""" + file_id: str + + class EvaluationRunData(BaseModel): """Serializable data for Celery task.""" evaluation_run_id: str @@ -51,6 +88,7 @@ class EvaluationRunData(BaseModel): evaluation_category: EvaluationCategory evaluation_model_provider: str evaluation_model: str - metrics_config: dict[str, Any] = Field(default_factory=dict) + default_metrics: list[dict[str, Any]] = Field(default_factory=list) + customized_metrics: dict[str, Any] | None = None judgment_config: JudgmentConfig | None = None items: list[EvaluationItemInput] diff --git a/api/services/evaluation_service.py b/api/services/evaluation_service.py index 3c55595fbe..40b3217e3b 100644 --- a/api/services/evaluation_service.py +++ b/api/services/evaluation_service.py @@ -10,9 +10,12 @@ from sqlalchemy.orm import Session from configs import dify_config from core.evaluation.entities.evaluation_entity import ( + DefaultMetric, EvaluationCategory, + EvaluationConfigData, EvaluationItemInput, EvaluationRunData, + EvaluationRunRequest, ) from core.evaluation.evaluation_manager import EvaluationManager from models.evaluation import ( @@ -222,7 +225,7 @@ class EvaluationService: target_type: str, target_id: str, account_id: str, - data: dict[str, Any], + data: EvaluationConfigData, ) -> EvaluationConfiguration: config = cls.get_evaluation_config(session, tenant_id, target_type, target_id) if config is None: @@ -235,10 +238,15 @@ class EvaluationService: ) session.add(config) - config.evaluation_model_provider = data.get("evaluation_model_provider") - config.evaluation_model = data.get("evaluation_model") - config.metrics_config = json.dumps(data.get("metrics_config", {})) - config.judgement_conditions = json.dumps(data.get("judgement_conditions", {})) + config.evaluation_model_provider = data.evaluation_model_provider + config.evaluation_model = data.evaluation_model + config.metrics_config = json.dumps({ + "default_metrics": [m.model_dump() for m in 
data.default_metrics], + "customized_metrics": data.customized_metrics.model_dump() if data.customized_metrics else None, + }) + config.judgement_conditions = json.dumps( + data.judgment_config.model_dump() if data.judgment_config else {} + ) config.updated_by = account_id session.commit() session.refresh(config) @@ -255,18 +263,30 @@ class EvaluationService: target_id: str, account_id: str, dataset_file_content: bytes, - evaluation_category: EvaluationCategory, + run_request: EvaluationRunRequest, ) -> EvaluationRun: - """Validate dataset, create run record, dispatch Celery task.""" + """Validate dataset, create run record, dispatch Celery task. + + Saves the provided parameters as the latest EvaluationConfiguration + before creating the run. + """ # Check framework is configured evaluation_instance = EvaluationManager.get_evaluation_instance() if evaluation_instance is None: raise EvaluationFrameworkNotConfiguredError() - # Check evaluation config exists - config = cls.get_evaluation_config(session, tenant_id, target_type, target_id) - if config is None: - raise EvaluationNotFoundError("Evaluation configuration not found. 
Please configure evaluation first.") + # Derive evaluation_category from default_metrics node types + evaluation_category = cls._resolve_evaluation_category(run_request.default_metrics) + + # Save as latest EvaluationConfiguration + config = cls.save_evaluation_config( + session=session, + tenant_id=tenant_id, + target_type=target_type, + target_id=target_id, + account_id=account_id, + data=run_request, + ) # Check concurrent run limit active_runs = ( @@ -308,9 +328,13 @@ class EvaluationService: target_type=target_type, target_id=target_id, evaluation_category=evaluation_category, - evaluation_model_provider=config.evaluation_model_provider or "", - evaluation_model=config.evaluation_model or "", - metrics_config=config.metrics_config_dict, + evaluation_model_provider=run_request.evaluation_model_provider, + evaluation_model=run_request.evaluation_model, + default_metrics=[m.model_dump() for m in run_request.default_metrics], + customized_metrics=( + run_request.customized_metrics.model_dump() if run_request.customized_metrics else None + ), + judgment_config=run_request.judgment_config, items=items, ) @@ -406,6 +430,23 @@ class EvaluationService: def get_supported_metrics(cls, category: EvaluationCategory) -> list[str]: return EvaluationManager.get_supported_metrics(category) + # ---- Category Resolution ---- + + @classmethod + def _resolve_evaluation_category(cls, default_metrics: list[DefaultMetric]) -> EvaluationCategory: + """Derive evaluation category from default_metrics node_info types. + + Uses the type of the first node_info found in default_metrics. + Falls back to LLM if no metrics are provided. 
+ """ + for metric in default_metrics: + for node_info in metric.node_info_list: + try: + return EvaluationCategory(node_info.type) + except ValueError: + continue + return EvaluationCategory.LLM + # ---- Dataset Parsing ---- @classmethod diff --git a/api/tasks/evaluation_task.py b/api/tasks/evaluation_task.py index 8bfd9cbf87..b74c4fae46 100644 --- a/api/tasks/evaluation_task.py +++ b/api/tasks/evaluation_task.py @@ -1,6 +1,8 @@ import io import json import logging +from configs import dify_config +from models.model import UploadFile from typing import Any from celery import shared_task @@ -80,7 +82,8 @@ def _execute_evaluation(session: Any, run_data: EvaluationRunData) -> None: target_id=run_data.target_id, target_type=run_data.target_type, items=run_data.items, - metrics_config=run_data.metrics_config, + default_metrics=run_data.default_metrics, + customized_metrics=run_data.customized_metrics, model_provider=run_data.evaluation_model_provider, model_name=run_data.evaluation_model, ) @@ -279,20 +282,16 @@ def _store_result_file( """Store result XLSX file and return the UploadFile ID.""" try: from extensions.ext_storage import storage - from models.model import UploadFile - from libs.uuid_utils import uuidv7 - file_id = str(uuidv7()) filename = f"evaluation-result-{run_id[:8]}.xlsx" - storage_key = f"evaluation_results/{tenant_id}/{file_id}.xlsx" + storage_key = f"evaluation_results/{tenant_id}/{str(uuidv7())}.xlsx" storage.save(storage_key, xlsx_content) - upload_file = UploadFile( - id=file_id, + upload_file: UploadFile = UploadFile( tenant_id=tenant_id, - storage_type="evaluation_result", + storage_type=dify_config.STORAGE_TYPE, key=storage_key, name=filename, size=len(xlsx_content), @@ -300,10 +299,12 @@ def _store_result_file( mime_type="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", created_by_role="account", created_by="system", + created_at=naive_utc_now(), + used=False, ) session.add(upload_file) session.commit() - return file_id + 
return upload_file.id except Exception: logger.exception("Failed to store result file for run %s", run_id) return None