feat: evaluation batch test (#35800)

Co-authored-by: jyong <718720800@qq.com>
Co-authored-by: Yansong Zhang <916125788@qq.com>
Co-authored-by: autofix-ci[bot] <114827586+autofix-ci[bot]@users.noreply.github.com>
Co-authored-by: hj24 <mambahj24@gmail.com>
Co-authored-by: hj24 <huangjian@dify.ai>
Co-authored-by: Joel <iamjoel007@gmail.com>
Co-authored-by: Stephen Zhou <38493346+hyoban@users.noreply.github.com>
Co-authored-by: CodingOnStar <hanxujiang@dify.com>
Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com>
FFXN 2026-05-05 21:04:58 +08:00 committed by GitHub
parent e1e17d8a51
commit a55171a80c
3 changed files with 55 additions and 22 deletions

View File

@@ -24,7 +24,7 @@ RUN apt-get update \
# Install Python dependencies (workspace members under providers/vdb/)
COPY pyproject.toml uv.lock ./
COPY providers ./providers
-RUN uv sync --locked --no-dev
+RUN uv sync --locked --no-dev --group evaluation
# production stage
FROM base AS production
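The sync step now also installs the optional `evaluation` dependency group pinned in `uv.lock`. As a hedged smoke test of the built image (which packages the group actually contains is an assumption; `openpyxl` is a likely member, since the evaluation service later in this commit imports it):

import importlib

# Hypothetical check that the evaluation extras are importable in the image;
# the module list here is an assumption, not read from pyproject.toml.
for module_name in ("openpyxl",):
    importlib.import_module(module_name)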

View File

@@ -187,7 +187,17 @@ evaluation_default_metrics_response_model = console_ns.model(
evaluation_dataset_columns_response_model = console_ns.model(
"EvaluationDatasetColumnsResponse",
{
"columns": fields.List(fields.String),
"columns": fields.List(
fields.Nested(
console_ns.model(
"EvaluationTemplateColumn",
{
"name": fields.String,
"type": fields.String,
},
)
)
),
},
)
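With this hunk, the columns endpoint returns structured column descriptors instead of bare strings. An illustrative response body under the new schema (the `query` column is a hypothetical app input; `index` and `expected_output` follow from the service code later in this commit):

{
    "columns": [
        {"name": "index", "type": "number"},
        {"name": "query", "type": "string"},
        {"name": "expected_output", "type": "string"}
    ]
}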
@@ -388,7 +398,7 @@ class EvaluationTemplateColumnsApi(Resource):
@account_initialization_required
@get_evaluation_target
def post(self, target: Union[App, CustomizedSnippet], target_type: str):
"""Return the dataset column names implied by the current evaluation config."""
"""Return the dataset template columns implied by the current evaluation config."""
body = request.get_json(silent=True) or {}
try:
config_data = EvaluationConfigData.model_validate(body)
@@ -441,7 +451,7 @@ class EvaluationLogsApi(Resource):
}
-@console_ns.route("/<string:evaluate_target_type>/<uuid:evaluate_target_id>/evaluation/run")
+@console_ns.route("/<string:evaluate_target_type>/<uuid:evaluate_target_id>/evaluation/run1")
class EvaluationRunApi(Resource):
@console_ns.doc("start_evaluation_run")
@console_ns.response(200, "Evaluation run started")
@@ -502,7 +512,7 @@
return {"message": str(e.description)}, 400
-@console_ns.route("/<string:evaluate_target_type>/<uuid:evaluate_target_id>/evaluation/run1")
+@console_ns.route("/<string:evaluate_target_type>/<uuid:evaluate_target_id>/evaluation/run")
class EvaluationRunRealApi(Resource):
@console_ns.doc("start_evaluation_run_real")
@console_ns.response(200, "Evaluation run started")
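Net effect of the two route hunks above: the real asynchronous run handler (`EvaluationRunRealApi`) is promoted to `/evaluation/run`, while the synthetic run used for frontend integration testing (`EvaluationRunApi`) moves aside to `/evaluation/run1`. A hedged client sketch of the promoted endpoint (base URL, auth scheme, and payload are assumptions, not taken from this diff):

import requests

base = "http://localhost:5001/console/api"  # assumed console API base URL
target = "apps/00000000-0000-0000-0000-000000000000"  # placeholder target type/id
resp = requests.post(
    f"{base}/{target}/evaluation/run",  # now served by EvaluationRunRealApi
    headers={"Authorization": "Bearer <access-token>"},  # assumed auth scheme
    json={},  # evaluation config payload; schema not shown in this hunk
)
print(resp.status_code)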

View File

@@ -3,7 +3,7 @@ import io
import json
import logging
from collections.abc import Mapping
-from typing import Any, Union
+from typing import Any, TypedDict, Union
from openpyxl import Workbook, load_workbook
from openpyxl.styles import Alignment, Border, Font, PatternFill, Side
@@ -53,6 +53,11 @@ from services.workflow_service import WorkflowService
logger = logging.getLogger(__name__)
+class EvaluationTemplateColumn(TypedDict):
+    name: str
+    type: str
class EvaluationService:
"""
Service for evaluation-related operations.
@@ -456,9 +461,9 @@
) -> EvaluationRun:
"""Persist a completed synthetic run for frontend integration testing.
-        This temporary path keeps the existing read flows (`logs`, `run detail`,
-        and result-file download) working for app evaluations while the real
-        execution logic is moved to `/evaluation/run1` for backend iteration.
+        This lightweight path keeps the existing read flows (`logs`, `run detail`,
+        and result-file download) available for app evaluations without invoking
+        the asynchronous real execution flow.
"""
from tasks.evaluation_task import (
_compute_metrics_summary,
@@ -670,18 +675,18 @@ class EvaluationService:
target: Union[App, CustomizedSnippet],
target_type: str,
data: EvaluationConfigData,
-    ) -> list[str]:
-        """Build dataset column names from target inputs and the selected evaluation config."""
-        input_columns = cls._get_target_input_column_names(target, target_type)
-        expected_output_columns = cls._get_expected_output_column_names(data.default_metrics)
-        return ["index", *input_columns, *expected_output_columns]
+    ) -> list[EvaluationTemplateColumn]:
+        """Build dataset template columns from target inputs and the selected evaluation config."""
+        input_columns = cls._get_target_input_columns(target, target_type)
+        expected_output_columns = cls._get_expected_output_columns(data.default_metrics)
+        return [{"name": "index", "type": "number"}, *input_columns, *expected_output_columns]
@classmethod
-    def _get_target_input_column_names(
+    def _get_target_input_columns(
cls,
target: Union[App, CustomizedSnippet],
target_type: str,
-    ) -> list[str]:
+    ) -> list[EvaluationTemplateColumn]:
"""Resolve user-input variables for the target in workflow order."""
if target_type == EvaluationTargetType.APPS.value and isinstance(target, App):
input_fields = cls._get_app_input_fields(target)
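Taken together, the builder now returns typed column descriptors. An illustrative return value for a hypothetical app with a string input `query` and a numeric input `top_k`, plus one metric requiring a reference answer (the `expected_output` name is likewise illustrative):

[
    {"name": "index", "type": "number"},            # always emitted first
    {"name": "query", "type": "string"},            # from the target's input fields
    {"name": "top_k", "type": "number"},            # via _normalize_template_column_type
    {"name": "expected_output", "type": "string"},  # expected-output columns are always strings
]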
@@ -690,23 +695,30 @@
else:
raise ValueError(f"Unsupported target type: {target_type}")
-        columns: list[str] = []
+        columns: list[EvaluationTemplateColumn] = []
seen: set[str] = set()
for field in input_fields:
column_name = str(field.get("variable") or field.get("label") or "").strip()
if not column_name or column_name in seen:
continue
seen.add(column_name)
-            columns.append(column_name)
+            columns.append(
+                {
+                    "name": column_name,
+                    "type": cls._normalize_template_column_type(
+                        field.get("type") or field.get("value_type") or field.get("input_type")
+                    ),
+                }
+            )
return columns
@classmethod
-    def _get_expected_output_column_names(
+    def _get_expected_output_columns(
cls,
default_metrics: list[DefaultMetric | Mapping[str, Any]],
-    ) -> list[str]:
+    ) -> list[EvaluationTemplateColumn]:
"""Build one expected_output column per visible node that needs a reference answer."""
-        columns: list[str] = []
+        columns: list[EvaluationTemplateColumn] = []
seen: set[str] = set()
for metric in cls.filter_console_default_metrics(default_metrics):
if metric.metric not in cls.METRICS_REQUIRING_EXPECTED_OUTPUT:
@@ -717,9 +729,20 @@
if column_name in seen:
continue
seen.add(column_name)
-            columns.append(column_name)
+            columns.append({"name": column_name, "type": "string"})
return columns
+    @staticmethod
+    def _normalize_template_column_type(raw_type: Any) -> str:
+        normalized = str(raw_type or "").strip().lower()
+        if normalized in {"number", "integer", "float", "int"}:
+            return "number"
+        if normalized in {"bool", "boolean", "switch"}:
+            return "boolean"
+        if normalized in {"file", "files"}:
+            return "file"
+        return "string"
@classmethod
def _nodes_for_metrics_from_workflow(
cls,
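For reference, the normalizer added above maps loosely typed field metadata onto the four template column types. A hedged usage sketch (the import path is assumed from the sibling `services.workflow_service` import near the top of this file):

from services.evaluation_service import EvaluationService  # assumed module path

assert EvaluationService._normalize_template_column_type("Integer") == "number"
assert EvaluationService._normalize_template_column_type("switch") == "boolean"
assert EvaluationService._normalize_template_column_type("files") == "file"
assert EvaluationService._normalize_template_column_type(None) == "string"  # fallback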