From fca92cc1d1c4929fada2b44ec68f468bc0e72d50 Mon Sep 17 00:00:00 2001
From: FFXN <31929997+FFXN@users.noreply.github.com>
Date: Tue, 5 May 2026 22:04:59 +0800
Subject: [PATCH] fix: evaluation (#35802)

Co-authored-by: jyong <718720800@qq.com>
Co-authored-by: Yansong Zhang <916125788@qq.com>
Co-authored-by: autofix-ci[bot] <114827586+autofix-ci[bot]@users.noreply.github.com>
Co-authored-by: hj24
Co-authored-by: hj24
Co-authored-by: Joel
Co-authored-by: Stephen Zhou <38493346+hyoban@users.noreply.github.com>
Co-authored-by: CodingOnStar
Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com>
---
 api/services/evaluation_service.py | 96 +++++++++++++++++++++++++-----
 1 file changed, 82 insertions(+), 14 deletions(-)

diff --git a/api/services/evaluation_service.py b/api/services/evaluation_service.py
index 72538d831e..a51be1a650 100644
--- a/api/services/evaluation_service.py
+++ b/api/services/evaluation_service.py
@@ -14,6 +14,7 @@ from configs import dify_config
 from core.evaluation.entities.evaluation_entity import (
     METRIC_NODE_TYPE_MAPPING,
     METRIC_VALUE_TYPE_MAPPING,
+    CustomizedMetrics,
     DefaultMetric,
     EvaluationCategory,
     EvaluationConfigData,
@@ -358,6 +359,55 @@ class EvaluationService:
             ).all()
         )
 
+    @classmethod
+    def resolve_run_request_config(
+        cls,
+        session: Session,
+        tenant_id: str,
+        target_type: str,
+        target_id: str,
+        run_request: EvaluationRunRequest,
+    ) -> EvaluationRunRequest:
+        """Hydrate an empty run request from the latest saved evaluation config.
+
+        Console callers may submit only ``file_id`` when they want to reuse the
+        previously saved evaluation settings. We only hydrate the request when
+        *all* config fields are absent so partially specified payloads remain
+        exactly as submitted by the caller.
+        """
+        if cls._has_inline_run_config(run_request):
+            return run_request
+
+        config = cls.get_evaluation_config(session, tenant_id, target_type, target_id)
+        if config is None:
+            raise EvaluationNotFoundError("Evaluation configuration not found. Save evaluation settings first.")
+
+        return EvaluationRunRequest(
+            file_id=run_request.file_id,
+            evaluation_model=config.evaluation_model or "",
+            evaluation_model_provider=config.evaluation_model_provider or "",
+            default_metrics=[DefaultMetric.model_validate(metric) for metric in config.default_metrics_list],
+            customized_metrics=(
+                CustomizedMetrics.model_validate(config.customized_metrics_dict)
+                if config.customized_metrics_dict
+                else None
+            ),
+            judgment_config=(
+                JudgmentConfig.model_validate(config.judgment_config_dict) if config.judgment_config_dict else None
+            ),
+        )
+
+    @staticmethod
+    def _has_inline_run_config(run_request: EvaluationRunRequest) -> bool:
+        """Return True when the request body already includes evaluation settings."""
+        return bool(
+            run_request.evaluation_model.strip()
+            or run_request.evaluation_model_provider.strip()
+            or run_request.default_metrics
+            or run_request.customized_metrics is not None
+            or run_request.judgment_config is not None
+        )
+
     # ---- Evaluation Run Management ----
 
     @classmethod
@@ -374,14 +424,23 @@ class EvaluationService:
     ) -> EvaluationRun:
         """Validate dataset, create run record, dispatch Celery task.
 
-        Saves the provided parameters as the latest EvaluationConfiguration
-        before creating the run.
+        Saves the effective parameters as the latest EvaluationConfiguration
+        before creating the run. When the request body contains only ``file_id``,
+        the latest saved evaluation config for the target is reused.
""" # Check framework is configured evaluation_instance = EvaluationManager.get_evaluation_instance() if evaluation_instance is None: raise EvaluationFrameworkNotConfiguredError() + resolved_run_request = cls.resolve_run_request_config( + session=session, + tenant_id=tenant_id, + target_type=target_type, + target_id=target_id, + run_request=run_request, + ) + # Save as latest EvaluationConfiguration config = cls.save_evaluation_config( session=session, @@ -389,7 +448,7 @@ class EvaluationService: target_type=target_type, target_id=target_id, account_id=account_id, - data=run_request, + data=resolved_run_request, ) # Check concurrent run limit @@ -416,7 +475,7 @@ class EvaluationService: target_id=target_id, evaluation_config_id=config.id, status=EvaluationRunStatus.PENDING, - dataset_file_id=run_request.file_id, + dataset_file_id=resolved_run_request.file_id, total_items=len(items), created_by=account_id, ) @@ -430,11 +489,11 @@ class EvaluationService: tenant_id=tenant_id, target_type=target_type, target_id=target_id, - evaluation_model_provider=run_request.evaluation_model_provider, - evaluation_model=run_request.evaluation_model, - default_metrics=run_request.default_metrics, - customized_metrics=run_request.customized_metrics, - judgment_config=run_request.judgment_config, + evaluation_model_provider=resolved_run_request.evaluation_model_provider, + evaluation_model=resolved_run_request.evaluation_model, + default_metrics=resolved_run_request.default_metrics, + customized_metrics=resolved_run_request.customized_metrics, + judgment_config=resolved_run_request.judgment_config, input_list=items, ) @@ -463,7 +522,8 @@ class EvaluationService: This lightweight path keeps the existing read flows (`logs`, `run detail`, and result-file download) available for app evaluations without invoking - the asynchronous real execution flow. + the asynchronous real execution flow. Requests that only provide + ``file_id`` reuse the latest saved evaluation config for the target. """ from tasks.evaluation_task import ( _compute_metrics_summary, @@ -472,13 +532,21 @@ class EvaluationService: _store_result_file, ) + resolved_run_request = cls.resolve_run_request_config( + session=session, + tenant_id=tenant_id, + target_type=target_type, + target_id=target_id, + run_request=run_request, + ) + config = cls.save_evaluation_config( session=session, tenant_id=tenant_id, target_type=target_type, target_id=target_id, account_id=account_id, - data=run_request, + data=resolved_run_request, ) items = cls._parse_dataset(dataset_file_content, dataset_filename) @@ -487,8 +555,8 @@ class EvaluationService: raise EvaluationDatasetInvalidError(f"Dataset has {len(items)} rows, max is {max_rows}.") now = naive_utc_now() - results = cls._build_stub_results(input_list=items, run_request=run_request) - metrics_summary = _compute_metrics_summary(results, run_request.judgment_config) + results = cls._build_stub_results(input_list=items, run_request=resolved_run_request) + metrics_summary = _compute_metrics_summary(results, resolved_run_request.judgment_config) evaluation_run = EvaluationRun( tenant_id=tenant_id, @@ -496,7 +564,7 @@ class EvaluationService: target_id=target_id, evaluation_config_id=config.id, status=EvaluationRunStatus.COMPLETED, - dataset_file_id=run_request.file_id, + dataset_file_id=resolved_run_request.file_id, total_items=len(items), completed_items=len(items), failed_items=0,