fix: evaluation (#35802)

Co-authored-by: jyong <718720800@qq.com>
Co-authored-by: Yansong Zhang <916125788@qq.com>
Co-authored-by: autofix-ci[bot] <114827586+autofix-ci[bot]@users.noreply.github.com>
Co-authored-by: hj24 <mambahj24@gmail.com>
Co-authored-by: hj24 <huangjian@dify.ai>
Co-authored-by: Joel <iamjoel007@gmail.com>
Co-authored-by: Stephen Zhou <38493346+hyoban@users.noreply.github.com>
Co-authored-by: CodingOnStar <hanxujiang@dify.com>
Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com>
Authored by FFXN on 2026-05-05 22:04:59 +08:00, committed by GitHub
parent 63f4eb227d
commit fca92cc1d1


@@ -14,6 +14,7 @@ from configs import dify_config
 from core.evaluation.entities.evaluation_entity import (
     METRIC_NODE_TYPE_MAPPING,
     METRIC_VALUE_TYPE_MAPPING,
+    CustomizedMetrics,
     DefaultMetric,
     EvaluationCategory,
     EvaluationConfigData,
@@ -358,6 +359,55 @@ class EvaluationService:
             ).all()
         )
 
+    @classmethod
+    def resolve_run_request_config(
+        cls,
+        session: Session,
+        tenant_id: str,
+        target_type: str,
+        target_id: str,
+        run_request: EvaluationRunRequest,
+    ) -> EvaluationRunRequest:
+        """Hydrate an empty run request from the latest saved evaluation config.
+
+        Console callers may submit only ``file_id`` when they want to reuse the
+        previously saved evaluation settings. We only hydrate the request when
+        *all* config fields are absent so partially specified payloads remain
+        exactly as submitted by the caller.
+        """
+        if cls._has_inline_run_config(run_request):
+            return run_request
+
+        config = cls.get_evaluation_config(session, tenant_id, target_type, target_id)
+        if config is None:
+            raise EvaluationNotFoundError("Evaluation configuration not found. Save evaluation settings first.")
+
+        return EvaluationRunRequest(
+            file_id=run_request.file_id,
+            evaluation_model=config.evaluation_model or "",
+            evaluation_model_provider=config.evaluation_model_provider or "",
+            default_metrics=[DefaultMetric.model_validate(metric) for metric in config.default_metrics_list],
+            customized_metrics=(
+                CustomizedMetrics.model_validate(config.customized_metrics_dict)
+                if config.customized_metrics_dict
+                else None
+            ),
+            judgment_config=(
+                JudgmentConfig.model_validate(config.judgment_config_dict) if config.judgment_config_dict else None
+            ),
+        )
+
+    @staticmethod
+    def _has_inline_run_config(run_request: EvaluationRunRequest) -> bool:
+        """Return True when the request body already includes evaluation settings."""
+        return bool(
+            run_request.evaluation_model.strip()
+            or run_request.evaluation_model_provider.strip()
+            or run_request.default_metrics
+            or run_request.customized_metrics is not None
+            or run_request.judgment_config is not None
+        )
+
     # ---- Evaluation Run Management ----
 
     @classmethod
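
To make the all-or-nothing hydration rule above concrete, here is a minimal, self-contained sketch of the check. The `RunRequestSketch` stand-in is an assumption for illustration only; the real `EvaluationRunRequest` is the Pydantic model from `core.evaluation.entities.evaluation_entity` and has richer field types.

# Minimal sketch of the inline-config check (illustrative only).
from pydantic import BaseModel


class RunRequestSketch(BaseModel):
    # Simplified stand-in: only mirrors the fields the check inspects.
    file_id: str
    evaluation_model: str = ""
    evaluation_model_provider: str = ""
    default_metrics: list[dict] = []
    customized_metrics: dict | None = None
    judgment_config: dict | None = None


def has_inline_run_config(req: RunRequestSketch) -> bool:
    # Mirrors _has_inline_run_config: any populated setting counts as inline,
    # so a partially specified payload is never silently merged with the
    # saved config.
    return bool(
        req.evaluation_model.strip()
        or req.evaluation_model_provider.strip()
        or req.default_metrics
        or req.customized_metrics is not None
        or req.judgment_config is not None
    )


# file_id only -> hydrate from the saved config
assert not has_inline_run_config(RunRequestSketch(file_id="f1"))
# any config field present -> use the payload exactly as submitted
assert has_inline_run_config(RunRequestSketch(file_id="f1", evaluation_model="gpt-4o"))
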
@@ -374,14 +424,23 @@
     ) -> EvaluationRun:
         """Validate dataset, create run record, dispatch Celery task.
 
-        Saves the provided parameters as the latest EvaluationConfiguration
-        before creating the run.
+        Saves the effective parameters as the latest EvaluationConfiguration
+        before creating the run. When the request body contains only ``file_id``,
+        the latest saved evaluation config for the target is reused.
         """
         # Check framework is configured
         evaluation_instance = EvaluationManager.get_evaluation_instance()
         if evaluation_instance is None:
             raise EvaluationFrameworkNotConfiguredError()
 
+        resolved_run_request = cls.resolve_run_request_config(
+            session=session,
+            tenant_id=tenant_id,
+            target_type=target_type,
+            target_id=target_id,
+            run_request=run_request,
+        )
+
         # Save as latest EvaluationConfiguration
         config = cls.save_evaluation_config(
             session=session,
@@ -389,7 +448,7 @@
             target_type=target_type,
             target_id=target_id,
             account_id=account_id,
-            data=run_request,
+            data=resolved_run_request,
         )
 
         # Check concurrent run limit
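
Because resolution happens before `save_evaluation_config`, the persisted "latest" configuration is always the effective one. A hedged sketch of the reuse path follows; the `services.evaluation_service` import path and the `"app"` target_type value are assumptions, and it presumes the real model defaults its config fields as the inline check implies.

from sqlalchemy.orm import Session

from core.evaluation.entities.evaluation_entity import EvaluationRunRequest  # path from this diff
from services.evaluation_service import EvaluationService  # module path assumed


def rerun_with_saved_settings(session: Session, tenant_id: str, app_id: str, file_id: str) -> EvaluationRunRequest:
    # A console caller that wants to reuse saved settings submits only file_id.
    file_only = EvaluationRunRequest(file_id=file_id)
    resolved = EvaluationService.resolve_run_request_config(
        session=session,
        tenant_id=tenant_id,
        target_type="app",  # assumed target_type value
        target_id=app_id,
        run_request=file_only,
    )
    # resolved carries the saved model, provider, and metrics while keeping
    # the caller's file_id; create-run then persists and dispatches with it.
    return resolved
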
@@ -416,7 +475,7 @@
             target_id=target_id,
             evaluation_config_id=config.id,
             status=EvaluationRunStatus.PENDING,
-            dataset_file_id=run_request.file_id,
+            dataset_file_id=resolved_run_request.file_id,
             total_items=len(items),
             created_by=account_id,
         )
@@ -430,11 +489,11 @@
             tenant_id=tenant_id,
             target_type=target_type,
             target_id=target_id,
-            evaluation_model_provider=run_request.evaluation_model_provider,
-            evaluation_model=run_request.evaluation_model,
-            default_metrics=run_request.default_metrics,
-            customized_metrics=run_request.customized_metrics,
-            judgment_config=run_request.judgment_config,
+            evaluation_model_provider=resolved_run_request.evaluation_model_provider,
+            evaluation_model=resolved_run_request.evaluation_model,
+            default_metrics=resolved_run_request.default_metrics,
+            customized_metrics=resolved_run_request.customized_metrics,
+            judgment_config=resolved_run_request.judgment_config,
             input_list=items,
         )
@@ -463,7 +522,8 @@
         This lightweight path keeps the existing read flows (`logs`, `run detail`,
         and result-file download) available for app evaluations without invoking
-        the asynchronous real execution flow.
+        the asynchronous real execution flow. Requests that only provide
+        ``file_id`` reuse the latest saved evaluation config for the target.
         """
         from tasks.evaluation_task import (
             _compute_metrics_summary,
@@ -472,13 +532,21 @@
             _store_result_file,
         )
 
+        resolved_run_request = cls.resolve_run_request_config(
+            session=session,
+            tenant_id=tenant_id,
+            target_type=target_type,
+            target_id=target_id,
+            run_request=run_request,
+        )
+
         config = cls.save_evaluation_config(
             session=session,
             tenant_id=tenant_id,
             target_type=target_type,
             target_id=target_id,
             account_id=account_id,
-            data=run_request,
+            data=resolved_run_request,
         )
 
         items = cls._parse_dataset(dataset_file_content, dataset_filename)
@@ -487,8 +555,8 @@
             raise EvaluationDatasetInvalidError(f"Dataset has {len(items)} rows, max is {max_rows}.")
 
         now = naive_utc_now()
-        results = cls._build_stub_results(input_list=items, run_request=run_request)
-        metrics_summary = _compute_metrics_summary(results, run_request.judgment_config)
+        results = cls._build_stub_results(input_list=items, run_request=resolved_run_request)
+        metrics_summary = _compute_metrics_summary(results, resolved_run_request.judgment_config)
 
         evaluation_run = EvaluationRun(
             tenant_id=tenant_id,
@@ -496,7 +564,7 @@
             target_id=target_id,
             evaluation_config_id=config.id,
             status=EvaluationRunStatus.COMPLETED,
-            dataset_file_id=run_request.file_id,
+            dataset_file_id=resolved_run_request.file_id,
             total_items=len(items),
             completed_items=len(items),
             failed_items=0,
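
A regression worth covering: a file-only request when no configuration has been saved yet, which `resolve_run_request_config` rejects with `EvaluationNotFoundError`. A test sketch under the same assumed module paths, using hypothetical fixtures:

import pytest

from core.evaluation.entities.evaluation_entity import EvaluationRunRequest  # path from this diff
from services.evaluation_service import EvaluationService  # module path assumed
from services.errors.evaluation import EvaluationNotFoundError  # error module path assumed


def test_file_only_request_without_saved_config_raises(session, tenant_id, app_id):
    # session / tenant_id / app_id are hypothetical fixtures for a tenant
    # that has never saved evaluation settings for this app.
    with pytest.raises(EvaluationNotFoundError):
        EvaluationService.resolve_run_request_config(
            session=session,
            tenant_id=tenant_id,
            target_type="app",  # assumed target_type value
            target_id=app_id,
            run_request=EvaluationRunRequest(file_id="dataset-file-id"),
        )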