From 4e593df662e54d7b23a1b55a9d2bc31c1a61d5e1 Mon Sep 17 00:00:00 2001 From: jyong <718720800@qq.com> Date: Wed, 4 Mar 2026 18:43:58 +0800 Subject: [PATCH 1/2] evaluation runtime --- .../console/evaluation/evaluation.py | 55 +++++++++------- .../evaluation/entities/evaluation_entity.py | 35 +++++++++- api/services/evaluation_service.py | 64 ++++++++++++++++--- api/tasks/evaluation_task.py | 16 ++--- 4 files changed, 130 insertions(+), 40 deletions(-) diff --git a/api/controllers/console/evaluation/evaluation.py b/api/controllers/console/evaluation/evaluation.py index 89acac1013..50f5834e39 100644 --- a/api/controllers/console/evaluation/evaluation.py +++ b/api/controllers/console/evaluation/evaluation.py @@ -20,9 +20,10 @@ from controllers.console.wraps import ( edit_permission_required, setup_required, ) -from core.evaluation.entities.evaluation_entity import EvaluationCategory +from core.evaluation.entities.evaluation_entity import EvaluationCategory, EvaluationRunRequest from core.workflow.file import helpers as file_helpers from extensions.ext_database import db +from extensions.ext_storage import storage from libs.helper import TimestampField from libs.login import current_account_with_tenant, login_required from models import App @@ -332,33 +333,43 @@ class EvaluationRunApi(Resource): """ Start an evaluation run. - Expects multipart form data with: - - file: XLSX dataset file - - evaluation_category: one of llm, retrieval, agent, workflow + Expects JSON body with: + - file_id: uploaded dataset file ID + - evaluation_model: evaluation model name + - evaluation_model_provider: evaluation model provider + - default_metrics: list of default metric objects + - customized_metrics: customized metrics object (optional) + - judgment_config: judgment conditions config (optional) """ current_account, current_tenant_id = current_account_with_tenant() - # Validate file upload - if "file" not in request.files: - raise BadRequest("Dataset file is required.") - file = request.files["file"] - if not file.filename or not file.filename.endswith(".xlsx"): - raise BadRequest("Dataset file must be an XLSX file.") + body = request.get_json(force=True) + if not body: + raise BadRequest("Request body is required.") + + # Validate and parse request body + try: + run_request = EvaluationRunRequest.model_validate(body) + except Exception as e: + raise BadRequest(f"Invalid request body: {e}") + + # Load dataset file + upload_file = ( + db.session.query(UploadFile) + .filter_by(id=run_request.file_id, tenant_id=current_tenant_id) + .first() + ) + if not upload_file: + raise NotFound("Dataset file not found.") + + try: + dataset_content = storage.load_once(upload_file.key) + except Exception: + raise BadRequest("Failed to read dataset file.") - dataset_content = file.read() if not dataset_content: raise BadRequest("Dataset file is empty.") - # Validate evaluation category - category_str = request.form.get("evaluation_category", "llm") - try: - evaluation_category = EvaluationCategory(category_str) - except ValueError: - raise BadRequest( - f"Invalid evaluation_category: {category_str}. " - f"Must be one of: {', '.join(e.value for e in EvaluationCategory)}" - ) - try: with Session(db.engine, expire_on_commit=False) as session: evaluation_run = EvaluationService.start_evaluation_run( @@ -368,7 +379,7 @@ class EvaluationRunApi(Resource): target_id=str(target.id), account_id=str(current_account.id), dataset_file_content=dataset_content, - evaluation_category=evaluation_category, + run_request=run_request, ) return _serialize_evaluation_run(evaluation_run), 200 except EvaluationFrameworkNotConfiguredError as e: diff --git a/api/core/evaluation/entities/evaluation_entity.py b/api/core/evaluation/entities/evaluation_entity.py index 026fde642e..1f873706a6 100644 --- a/api/core/evaluation/entities/evaluation_entity.py +++ b/api/core/evaluation/entities/evaluation_entity.py @@ -8,9 +8,10 @@ from core.evaluation.entities.judgment_entity import JudgmentConfig, JudgmentRes class EvaluationCategory(StrEnum): LLM = "llm" - RETRIEVAL = "retrieval" + RETRIEVAL = "knowledge_retrieval" AGENT = "agent" WORKFLOW = "workflow" + RETRIEVAL_TEST = "retrieval_test" class EvaluationMetric(BaseModel): @@ -42,6 +43,38 @@ class EvaluationItemResult(BaseModel): return sum(scores) / len(scores) +class NodeInfo(BaseModel): + node_id: str + type: str + title: str + + +class DefaultMetric(BaseModel): + metric: str + node_info_list: list[NodeInfo] + + +class CustomizedMetricOutputField(BaseModel): + variable: str + value_type: str + + +class CustomizedMetrics(BaseModel): + evaluation_workflow_id: str + input_fields: dict[str, str] + output_fields: list[CustomizedMetricOutputField] + + +class EvaluationRunRequest(BaseModel): + """Request body for starting an evaluation run.""" + file_id: str + evaluation_model: str = "" + evaluation_model_provider: str = "" + default_metrics: list[DefaultMetric] = Field(default_factory=list) + customized_metrics: CustomizedMetrics | None = None + judgment_config: JudgmentConfig | None = None + + class EvaluationRunData(BaseModel): """Serializable data for Celery task.""" evaluation_run_id: str diff --git a/api/services/evaluation_service.py b/api/services/evaluation_service.py index 3c55595fbe..8cff8cd3b9 100644 --- a/api/services/evaluation_service.py +++ b/api/services/evaluation_service.py @@ -10,9 +10,11 @@ from sqlalchemy.orm import Session from configs import dify_config from core.evaluation.entities.evaluation_entity import ( + DefaultMetric, EvaluationCategory, EvaluationItemInput, EvaluationRunData, + EvaluationRunRequest, ) from core.evaluation.evaluation_manager import EvaluationManager from models.evaluation import ( @@ -255,18 +257,44 @@ class EvaluationService: target_id: str, account_id: str, dataset_file_content: bytes, - evaluation_category: EvaluationCategory, + run_request: EvaluationRunRequest, ) -> EvaluationRun: - """Validate dataset, create run record, dispatch Celery task.""" + """Validate dataset, create run record, dispatch Celery task. + + Saves the provided parameters as the latest EvaluationConfiguration + before creating the run. + """ # Check framework is configured evaluation_instance = EvaluationManager.get_evaluation_instance() if evaluation_instance is None: raise EvaluationFrameworkNotConfiguredError() - # Check evaluation config exists - config = cls.get_evaluation_config(session, tenant_id, target_type, target_id) - if config is None: - raise EvaluationNotFoundError("Evaluation configuration not found. Please configure evaluation first.") + # Derive evaluation_category from default_metrics node types + evaluation_category = cls._resolve_evaluation_category(run_request.default_metrics) + + # Build metrics_config from default_metrics and customized_metrics + metrics_config: dict[str, Any] = { + "default_metrics": [m.model_dump() for m in run_request.default_metrics], + } + if run_request.customized_metrics is not None: + metrics_config["customized_metrics"] = run_request.customized_metrics.model_dump() + + # Save as latest EvaluationConfiguration + config = cls.save_evaluation_config( + session=session, + tenant_id=tenant_id, + target_type=target_type, + target_id=target_id, + account_id=account_id, + data={ + "evaluation_model_provider": run_request.evaluation_model_provider, + "evaluation_model": run_request.evaluation_model, + "metrics_config": metrics_config, + "judgement_conditions": ( + run_request.judgment_config.model_dump() if run_request.judgment_config else {} + ), + }, + ) # Check concurrent run limit active_runs = ( @@ -308,9 +336,10 @@ class EvaluationService: target_type=target_type, target_id=target_id, evaluation_category=evaluation_category, - evaluation_model_provider=config.evaluation_model_provider or "", - evaluation_model=config.evaluation_model or "", - metrics_config=config.metrics_config_dict, + evaluation_model_provider=run_request.evaluation_model_provider, + evaluation_model=run_request.evaluation_model, + metrics_config=metrics_config, + judgment_config=run_request.judgment_config, items=items, ) @@ -406,6 +435,23 @@ class EvaluationService: def get_supported_metrics(cls, category: EvaluationCategory) -> list[str]: return EvaluationManager.get_supported_metrics(category) + # ---- Category Resolution ---- + + @classmethod + def _resolve_evaluation_category(cls, default_metrics: list[DefaultMetric]) -> EvaluationCategory: + """Derive evaluation category from default_metrics node_info types. + + Uses the type of the first node_info found in default_metrics. + Falls back to LLM if no metrics are provided. + """ + for metric in default_metrics: + for node_info in metric.node_info_list: + try: + return EvaluationCategory(node_info.type) + except ValueError: + continue + return EvaluationCategory.LLM + # ---- Dataset Parsing ---- @classmethod diff --git a/api/tasks/evaluation_task.py b/api/tasks/evaluation_task.py index 8bfd9cbf87..c6daeae9ed 100644 --- a/api/tasks/evaluation_task.py +++ b/api/tasks/evaluation_task.py @@ -1,6 +1,8 @@ import io import json import logging +from configs import dify_config +from models.model import UploadFile from typing import Any from celery import shared_task @@ -279,20 +281,16 @@ def _store_result_file( """Store result XLSX file and return the UploadFile ID.""" try: from extensions.ext_storage import storage - from models.model import UploadFile - from libs.uuid_utils import uuidv7 - file_id = str(uuidv7()) filename = f"evaluation-result-{run_id[:8]}.xlsx" - storage_key = f"evaluation_results/{tenant_id}/{file_id}.xlsx" + storage_key = f"evaluation_results/{tenant_id}/{str(uuidv7())}.xlsx" storage.save(storage_key, xlsx_content) - upload_file = UploadFile( - id=file_id, + upload_file: UploadFile = UploadFile( tenant_id=tenant_id, - storage_type="evaluation_result", + storage_type=dify_config.STORAGE_TYPE, key=storage_key, name=filename, size=len(xlsx_content), @@ -300,10 +298,12 @@ def _store_result_file( mime_type="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", created_by_role="account", created_by="system", + created_at=naive_utc_now(), + used=False, ) session.add(upload_file) session.commit() - return file_id + return upload_file.id except Exception: logger.exception("Failed to store result file for run %s", run_id) return None From 13c0d6eddb827f9ff2720b0b3e1574d4727e1507 Mon Sep 17 00:00:00 2001 From: jyong <718720800@qq.com> Date: Wed, 4 Mar 2026 19:20:08 +0800 Subject: [PATCH 2/2] evaluation runtime --- .../console/evaluation/evaluation.py | 11 ++++-- .../evaluation/entities/evaluation_entity.py | 13 +++++-- api/services/evaluation_service.py | 37 ++++++++----------- api/tasks/evaluation_task.py | 3 +- 4 files changed, 35 insertions(+), 29 deletions(-) diff --git a/api/controllers/console/evaluation/evaluation.py b/api/controllers/console/evaluation/evaluation.py index 50f5834e39..1b0445728d 100644 --- a/api/controllers/console/evaluation/evaluation.py +++ b/api/controllers/console/evaluation/evaluation.py @@ -20,7 +20,7 @@ from controllers.console.wraps import ( edit_permission_required, setup_required, ) -from core.evaluation.entities.evaluation_entity import EvaluationCategory, EvaluationRunRequest +from core.evaluation.entities.evaluation_entity import EvaluationCategory, EvaluationConfigData, EvaluationRunRequest from core.workflow.file import helpers as file_helpers from extensions.ext_database import db from extensions.ext_storage import storage @@ -261,7 +261,12 @@ class EvaluationDetailApi(Resource): Save evaluation configuration for the target. """ current_account, current_tenant_id = current_account_with_tenant() - data = request.get_json(force=True) + body = request.get_json(force=True) + + try: + config_data = EvaluationConfigData.model_validate(body) + except Exception as e: + raise BadRequest(f"Invalid request body: {e}") with Session(db.engine, expire_on_commit=False) as session: config = EvaluationService.save_evaluation_config( @@ -270,7 +275,7 @@ class EvaluationDetailApi(Resource): target_type=target_type, target_id=str(target.id), account_id=str(current_account.id), - data=data, + data=config_data, ) return { diff --git a/api/core/evaluation/entities/evaluation_entity.py b/api/core/evaluation/entities/evaluation_entity.py index 1f873706a6..96a2cf461b 100644 --- a/api/core/evaluation/entities/evaluation_entity.py +++ b/api/core/evaluation/entities/evaluation_entity.py @@ -65,9 +65,8 @@ class CustomizedMetrics(BaseModel): output_fields: list[CustomizedMetricOutputField] -class EvaluationRunRequest(BaseModel): - """Request body for starting an evaluation run.""" - file_id: str +class EvaluationConfigData(BaseModel): + """Structured data for saving evaluation configuration.""" evaluation_model: str = "" evaluation_model_provider: str = "" default_metrics: list[DefaultMetric] = Field(default_factory=list) @@ -75,6 +74,11 @@ class EvaluationRunRequest(BaseModel): judgment_config: JudgmentConfig | None = None +class EvaluationRunRequest(EvaluationConfigData): + """Request body for starting an evaluation run.""" + file_id: str + + class EvaluationRunData(BaseModel): """Serializable data for Celery task.""" evaluation_run_id: str @@ -84,6 +88,7 @@ class EvaluationRunData(BaseModel): evaluation_category: EvaluationCategory evaluation_model_provider: str evaluation_model: str - metrics_config: dict[str, Any] = Field(default_factory=dict) + default_metrics: list[dict[str, Any]] = Field(default_factory=list) + customized_metrics: dict[str, Any] | None = None judgment_config: JudgmentConfig | None = None items: list[EvaluationItemInput] diff --git a/api/services/evaluation_service.py b/api/services/evaluation_service.py index 8cff8cd3b9..40b3217e3b 100644 --- a/api/services/evaluation_service.py +++ b/api/services/evaluation_service.py @@ -12,6 +12,7 @@ from configs import dify_config from core.evaluation.entities.evaluation_entity import ( DefaultMetric, EvaluationCategory, + EvaluationConfigData, EvaluationItemInput, EvaluationRunData, EvaluationRunRequest, @@ -224,7 +225,7 @@ class EvaluationService: target_type: str, target_id: str, account_id: str, - data: dict[str, Any], + data: EvaluationConfigData, ) -> EvaluationConfiguration: config = cls.get_evaluation_config(session, tenant_id, target_type, target_id) if config is None: @@ -237,10 +238,15 @@ class EvaluationService: ) session.add(config) - config.evaluation_model_provider = data.get("evaluation_model_provider") - config.evaluation_model = data.get("evaluation_model") - config.metrics_config = json.dumps(data.get("metrics_config", {})) - config.judgement_conditions = json.dumps(data.get("judgement_conditions", {})) + config.evaluation_model_provider = data.evaluation_model_provider + config.evaluation_model = data.evaluation_model + config.metrics_config = json.dumps({ + "default_metrics": [m.model_dump() for m in data.default_metrics], + "customized_metrics": data.customized_metrics.model_dump() if data.customized_metrics else None, + }) + config.judgement_conditions = json.dumps( + data.judgment_config.model_dump() if data.judgment_config else {} + ) config.updated_by = account_id session.commit() session.refresh(config) @@ -272,13 +278,6 @@ class EvaluationService: # Derive evaluation_category from default_metrics node types evaluation_category = cls._resolve_evaluation_category(run_request.default_metrics) - # Build metrics_config from default_metrics and customized_metrics - metrics_config: dict[str, Any] = { - "default_metrics": [m.model_dump() for m in run_request.default_metrics], - } - if run_request.customized_metrics is not None: - metrics_config["customized_metrics"] = run_request.customized_metrics.model_dump() - # Save as latest EvaluationConfiguration config = cls.save_evaluation_config( session=session, @@ -286,14 +285,7 @@ class EvaluationService: target_type=target_type, target_id=target_id, account_id=account_id, - data={ - "evaluation_model_provider": run_request.evaluation_model_provider, - "evaluation_model": run_request.evaluation_model, - "metrics_config": metrics_config, - "judgement_conditions": ( - run_request.judgment_config.model_dump() if run_request.judgment_config else {} - ), - }, + data=run_request, ) # Check concurrent run limit @@ -338,7 +330,10 @@ class EvaluationService: evaluation_category=evaluation_category, evaluation_model_provider=run_request.evaluation_model_provider, evaluation_model=run_request.evaluation_model, - metrics_config=metrics_config, + default_metrics=[m.model_dump() for m in run_request.default_metrics], + customized_metrics=( + run_request.customized_metrics.model_dump() if run_request.customized_metrics else None + ), judgment_config=run_request.judgment_config, items=items, ) diff --git a/api/tasks/evaluation_task.py b/api/tasks/evaluation_task.py index c6daeae9ed..b74c4fae46 100644 --- a/api/tasks/evaluation_task.py +++ b/api/tasks/evaluation_task.py @@ -82,7 +82,8 @@ def _execute_evaluation(session: Any, run_data: EvaluationRunData) -> None: target_id=run_data.target_id, target_type=run_data.target_type, items=run_data.items, - metrics_config=run_data.metrics_config, + default_metrics=run_data.default_metrics, + customized_metrics=run_data.customized_metrics, model_provider=run_data.evaluation_model_provider, model_name=run_data.evaluation_model, )