feat: evaluation batch test (#35800)

Co-authored-by: jyong <718720800@qq.com>
Co-authored-by: Yansong Zhang <916125788@qq.com>
Co-authored-by: autofix-ci[bot] <114827586+autofix-ci[bot]@users.noreply.github.com>
Co-authored-by: hj24 <mambahj24@gmail.com>
Co-authored-by: hj24 <huangjian@dify.ai>
Co-authored-by: Joel <iamjoel007@gmail.com>
Co-authored-by: Stephen Zhou <38493346+hyoban@users.noreply.github.com>
Co-authored-by: CodingOnStar <hanxujiang@dify.com>
Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com>
FFXN 2026-05-05 21:04:58 +08:00 committed by GitHub
parent e1e17d8a51
commit a55171a80c
3 changed files with 55 additions and 22 deletions

View File

@@ -24,7 +24,7 @@ RUN apt-get update \
# Install Python dependencies (workspace members under providers/vdb/)
COPY pyproject.toml uv.lock ./
COPY providers ./providers
-RUN uv sync --locked --no-dev
+RUN uv sync --locked --no-dev --group evaluation
# production stage
FROM base AS production
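The sync step now also installs the optional `evaluation` dependency group pinned in `uv.lock`. As a hedged smoke test of the built image (which packages the group actually contains is an assumption; `openpyxl` is a likely member, since the evaluation service later in this commit imports it):

import importlib

# Hypothetical check that the evaluation extras are importable in the image;
# the module list here is an assumption, not read from pyproject.toml.
for module_name in ("openpyxl",):
    importlib.import_module(module_name)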

View File

@@ -187,7 +187,17 @@ evaluation_default_metrics_response_model = console_ns.model(
evaluation_dataset_columns_response_model = console_ns.model(
"EvaluationDatasetColumnsResponse",
{
"columns": fields.List(fields.String),
"columns": fields.List(
fields.Nested(
console_ns.model(
"EvaluationTemplateColumn",
{
"name": fields.String,
"type": fields.String,
},
)
)
),
},
)
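With this hunk, the columns endpoint returns structured column descriptors instead of bare strings. An illustrative response body under the new schema (the `query` column is a hypothetical app input; `index` and `expected_output` follow from the service code later in this commit):

{
    "columns": [
        {"name": "index", "type": "number"},
        {"name": "query", "type": "string"},
        {"name": "expected_output", "type": "string"}
    ]
}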
@@ -388,7 +398,7 @@ class EvaluationTemplateColumnsApi(Resource):
@account_initialization_required
@get_evaluation_target
def post(self, target: Union[App, CustomizedSnippet], target_type: str):
"""Return the dataset column names implied by the current evaluation config."""
"""Return the dataset template columns implied by the current evaluation config."""
body = request.get_json(silent=True) or {}
try:
config_data = EvaluationConfigData.model_validate(body)
@@ -441,7 +451,7 @@ class EvaluationLogsApi(Resource):
}
-@console_ns.route("/<string:evaluate_target_type>/<uuid:evaluate_target_id>/evaluation/run")
+@console_ns.route("/<string:evaluate_target_type>/<uuid:evaluate_target_id>/evaluation/run1")
class EvaluationRunApi(Resource):
@console_ns.doc("start_evaluation_run")
@console_ns.response(200, "Evaluation run started")
@@ -502,7 +512,7 @@
return {"message": str(e.description)}, 400
-@console_ns.route("/<string:evaluate_target_type>/<uuid:evaluate_target_id>/evaluation/run1")
+@console_ns.route("/<string:evaluate_target_type>/<uuid:evaluate_target_id>/evaluation/run")
class EvaluationRunRealApi(Resource):
@console_ns.doc("start_evaluation_run_real")
@console_ns.response(200, "Evaluation run started")
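Net effect of the two route hunks above: the real asynchronous run handler (`EvaluationRunRealApi`) is promoted to `/evaluation/run`, while the synthetic run used for frontend integration testing (`EvaluationRunApi`) moves aside to `/evaluation/run1`. A hedged client sketch of the promoted endpoint (base URL, auth scheme, and payload are assumptions, not taken from this diff):

import requests

base = "http://localhost:5001/console/api"  # assumed console API base URL
target = "apps/00000000-0000-0000-0000-000000000000"  # placeholder target type/id
resp = requests.post(
    f"{base}/{target}/evaluation/run",  # now served by EvaluationRunRealApi
    headers={"Authorization": "Bearer <access-token>"},  # assumed auth scheme
    json={},  # evaluation config payload; schema not shown in this hunk
)
print(resp.status_code)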

View File

@@ -3,7 +3,7 @@ import io
import json
import logging
from collections.abc import Mapping
-from typing import Any, Union
+from typing import Any, TypedDict, Union
from openpyxl import Workbook, load_workbook
from openpyxl.styles import Alignment, Border, Font, PatternFill, Side
@@ -53,6 +53,11 @@ from services.workflow_service import WorkflowService
logger = logging.getLogger(__name__)
+class EvaluationTemplateColumn(TypedDict):
+    name: str
+    type: str
class EvaluationService:
"""
Service for evaluation-related operations.
@@ -456,9 +461,9 @@
) -> EvaluationRun:
"""Persist a completed synthetic run for frontend integration testing.
-        This temporary path keeps the existing read flows (`logs`, `run detail`,
-        and result-file download) working for app evaluations while the real
-        execution logic is moved to `/evaluation/run1` for backend iteration.
+        This lightweight path keeps the existing read flows (`logs`, `run detail`,
+        and result-file download) available for app evaluations without invoking
+        the asynchronous real execution flow.
"""
from tasks.evaluation_task import (
_compute_metrics_summary,
@@ -670,18 +675,18 @@ class EvaluationService:
target: Union[App, CustomizedSnippet],
target_type: str,
data: EvaluationConfigData,
-    ) -> list[str]:
-        """Build dataset column names from target inputs and the selected evaluation config."""
-        input_columns = cls._get_target_input_column_names(target, target_type)
-        expected_output_columns = cls._get_expected_output_column_names(data.default_metrics)
-        return ["index", *input_columns, *expected_output_columns]
+    ) -> list[EvaluationTemplateColumn]:
+        """Build dataset template columns from target inputs and the selected evaluation config."""
+        input_columns = cls._get_target_input_columns(target, target_type)
+        expected_output_columns = cls._get_expected_output_columns(data.default_metrics)
+        return [{"name": "index", "type": "number"}, *input_columns, *expected_output_columns]
@classmethod
-    def _get_target_input_column_names(
+    def _get_target_input_columns(
cls,
target: Union[App, CustomizedSnippet],
target_type: str,
-    ) -> list[str]:
+    ) -> list[EvaluationTemplateColumn]:
"""Resolve user-input variables for the target in workflow order."""
if target_type == EvaluationTargetType.APPS.value and isinstance(target, App):
input_fields = cls._get_app_input_fields(target)
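Taken together, the builder now returns typed column descriptors. An illustrative return value for a hypothetical app with a string input `query` and a numeric input `top_k`, plus one metric requiring a reference answer (the `expected_output` name is likewise illustrative):

[
    {"name": "index", "type": "number"},            # always emitted first
    {"name": "query", "type": "string"},            # from the target's input fields
    {"name": "top_k", "type": "number"},            # via _normalize_template_column_type
    {"name": "expected_output", "type": "string"},  # expected-output columns are always strings
]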
@@ -690,23 +695,30 @@
else:
raise ValueError(f"Unsupported target type: {target_type}")
-        columns: list[str] = []
+        columns: list[EvaluationTemplateColumn] = []
seen: set[str] = set()
for field in input_fields:
column_name = str(field.get("variable") or field.get("label") or "").strip()
if not column_name or column_name in seen:
continue
seen.add(column_name)
-            columns.append(column_name)
+            columns.append(
+                {
+                    "name": column_name,
+                    "type": cls._normalize_template_column_type(
+                        field.get("type") or field.get("value_type") or field.get("input_type")
+                    ),
+                }
+            )
return columns
@classmethod
-    def _get_expected_output_column_names(
+    def _get_expected_output_columns(
cls,
default_metrics: list[DefaultMetric | Mapping[str, Any]],
-    ) -> list[str]:
+    ) -> list[EvaluationTemplateColumn]:
"""Build one expected_output column per visible node that needs a reference answer."""
-        columns: list[str] = []
+        columns: list[EvaluationTemplateColumn] = []
seen: set[str] = set()
for metric in cls.filter_console_default_metrics(default_metrics):
if metric.metric not in cls.METRICS_REQUIRING_EXPECTED_OUTPUT:
@@ -717,9 +729,20 @@
if column_name in seen:
continue
seen.add(column_name)
-            columns.append(column_name)
+            columns.append({"name": column_name, "type": "string"})
return columns
+    @staticmethod
+    def _normalize_template_column_type(raw_type: Any) -> str:
+        normalized = str(raw_type or "").strip().lower()
+        if normalized in {"number", "integer", "float", "int"}:
+            return "number"
+        if normalized in {"bool", "boolean", "switch"}:
+            return "boolean"
+        if normalized in {"file", "files"}:
+            return "file"
+        return "string"
@classmethod
def _nodes_for_metrics_from_workflow(
cls,
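For reference, the normalizer added above maps loosely typed field metadata onto the four template column types. A hedged usage sketch (the import path is assumed from the sibling `services.workflow_service` import near the top of this file):

from services.evaluation_service import EvaluationService  # assumed module path

assert EvaluationService._normalize_template_column_type("Integer") == "number"
assert EvaluationService._normalize_template_column_type("switch") == "boolean"
assert EvaluationService._normalize_template_column_type("files") == "file"
assert EvaluationService._normalize_template_column_type(None) == "string"  # fallback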