Mirror of https://github.com/langgenius/dify.git, synced 2026-05-06 18:27:19 +08:00
feat: evaluation batch test (#35800)
Co-authored-by: jyong <718720800@qq.com>
Co-authored-by: Yansong Zhang <916125788@qq.com>
Co-authored-by: autofix-ci[bot] <114827586+autofix-ci[bot]@users.noreply.github.com>
Co-authored-by: hj24 <mambahj24@gmail.com>
Co-authored-by: hj24 <huangjian@dify.ai>
Co-authored-by: Joel <iamjoel007@gmail.com>
Co-authored-by: Stephen Zhou <38493346+hyoban@users.noreply.github.com>
Co-authored-by: CodingOnStar <hanxujiang@dify.com>
Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com>
parent e1e17d8a51
commit a55171a80c
@@ -24,7 +24,7 @@ RUN apt-get update \
 # Install Python dependencies (workspace members under providers/vdb/)
 COPY pyproject.toml uv.lock ./
 COPY providers ./providers
-RUN uv sync --locked --no-dev
+RUN uv sync --locked --no-dev --group evaluation
 
 # production stage
 FROM base AS production
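The only change in this Dockerfile stage is the extra `--group evaluation` on the second `uv sync`, which layers an optional dependency group on top of the locked default install. uv reads such groups from a PEP 735 `[dependency-groups]` table in `pyproject.toml`; a hypothetical sketch of what that declaration could look like follows (the actual group contents live in the repo's `pyproject.toml` and `uv.lock`, not in this diff):

# Hypothetical pyproject.toml excerpt; package names are illustrative, not the real group.
[dependency-groups]
evaluation = [
    "openpyxl>=3.1",  # spreadsheet handling used by the evaluation service below
]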
@@ -187,7 +187,17 @@ evaluation_default_metrics_response_model = console_ns.model(
 evaluation_dataset_columns_response_model = console_ns.model(
     "EvaluationDatasetColumnsResponse",
     {
-        "columns": fields.List(fields.String),
+        "columns": fields.List(
+            fields.Nested(
+                console_ns.model(
+                    "EvaluationTemplateColumn",
+                    {
+                        "name": fields.String,
+                        "type": fields.String,
+                    },
+                )
+            )
+        ),
     },
 )
 
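With the nested model, `columns` is no longer a flat list of strings but a list of `{name, type}` objects, so the frontend can type dataset cells. Roughly, for a hypothetical app with a single `query` input (field names here are illustrative):

# Shape before the change: bare column names.
old_payload = {"columns": ["index", "query", "expected_output"]}

# Shape after the change: each column carries a normalized type.
new_payload = {
    "columns": [
        {"name": "index", "type": "number"},
        {"name": "query", "type": "string"},
        {"name": "expected_output", "type": "string"},
    ]
}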
@@ -388,7 +398,7 @@ class EvaluationTemplateColumnsApi(Resource):
     @account_initialization_required
     @get_evaluation_target
     def post(self, target: Union[App, CustomizedSnippet], target_type: str):
-        """Return the dataset column names implied by the current evaluation config."""
+        """Return the dataset template columns implied by the current evaluation config."""
         body = request.get_json(silent=True) or {}
         try:
             config_data = EvaluationConfigData.model_validate(body)
@@ -441,7 +451,7 @@ class EvaluationLogsApi(Resource):
         }
 
 
-@console_ns.route("/<string:evaluate_target_type>/<uuid:evaluate_target_id>/evaluation/run")
+@console_ns.route("/<string:evaluate_target_type>/<uuid:evaluate_target_id>/evaluation/run1")
 class EvaluationRunApi(Resource):
     @console_ns.doc("start_evaluation_run")
     @console_ns.response(200, "Evaluation run started")
@@ -502,7 +512,7 @@ class EvaluationRunApi(Resource):
             return {"message": str(e.description)}, 400
 
 
-@console_ns.route("/<string:evaluate_target_type>/<uuid:evaluate_target_id>/evaluation/run1")
+@console_ns.route("/<string:evaluate_target_type>/<uuid:evaluate_target_id>/evaluation/run")
 class EvaluationRunRealApi(Resource):
     @console_ns.doc("start_evaluation_run_real")
     @console_ns.response(200, "Evaluation run started")
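Read together, these two hunks swap the public routes: the synthetic `EvaluationRunApi` moves to `/evaluation/run1`, while `EvaluationRunRealApi` (the asynchronous real execution) takes over `/evaluation/run`. A minimal flask-restx sketch of the same registration pattern, with stand-in names rather than the actual Dify resources:

from flask import Flask
from flask_restx import Api, Namespace, Resource

app = Flask(__name__)
api = Api(app)
ns = Namespace("console", path="/console")
api.add_namespace(ns)


@ns.route("/evaluation/run")
class RealRunApi(Resource):  # stand-in for EvaluationRunRealApi
    def post(self):
        # The canonical path now dispatches the real asynchronous run.
        return {"handler": "real"}


@ns.route("/evaluation/run1")
class SyntheticRunApi(Resource):  # stand-in for EvaluationRunApi
    def post(self):
        # The synthetic, synchronous run is parked on /run1.
        return {"handler": "synthetic"}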
@@ -3,7 +3,7 @@ import io
 import json
 import logging
 from collections.abc import Mapping
-from typing import Any, Union
+from typing import Any, TypedDict, Union
 
 from openpyxl import Workbook, load_workbook
 from openpyxl.styles import Alignment, Border, Font, PatternFill, Side
@@ -53,6 +53,11 @@ from services.workflow_service import WorkflowService
 logger = logging.getLogger(__name__)
 
 
+class EvaluationTemplateColumn(TypedDict):
+    name: str
+    type: str
+
+
 class EvaluationService:
     """
     Service for evaluation-related operations.
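`EvaluationTemplateColumn` gives every column dict a checkable shape: anywhere a `list[EvaluationTemplateColumn]` is annotated, mypy or pyright will reject a missing key, an unknown key, or a non-string value. A short usage sketch:

from typing import TypedDict


class EvaluationTemplateColumn(TypedDict):
    name: str
    type: str


columns: list[EvaluationTemplateColumn] = [
    {"name": "index", "type": "number"},
    {"name": "query", "type": "string"},
]

# Static checkers flag shape errors at analysis time, e.g.:
# {"name": "query"}                    -> error: key "type" missing
# {"name": "query", "kind": "string"}  -> error: unexpected key "kind"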
@@ -456,9 +461,9 @@ class EvaluationService:
     ) -> EvaluationRun:
         """Persist a completed synthetic run for frontend integration testing.
 
-        This temporary path keeps the existing read flows (`logs`, `run detail`,
-        and result-file download) working for app evaluations while the real
-        execution logic is moved to `/evaluation/run1` for backend iteration.
+        This lightweight path keeps the existing read flows (`logs`, `run detail`,
+        and result-file download) available for app evaluations without invoking
+        the asynchronous real execution flow.
         """
         from tasks.evaluation_task import (
             _compute_metrics_summary,
@@ -670,18 +675,18 @@ class EvaluationService:
         target: Union[App, CustomizedSnippet],
         target_type: str,
         data: EvaluationConfigData,
-    ) -> list[str]:
-        """Build dataset column names from target inputs and the selected evaluation config."""
-        input_columns = cls._get_target_input_column_names(target, target_type)
-        expected_output_columns = cls._get_expected_output_column_names(data.default_metrics)
-        return ["index", *input_columns, *expected_output_columns]
+    ) -> list[EvaluationTemplateColumn]:
+        """Build dataset template columns from target inputs and the selected evaluation config."""
+        input_columns = cls._get_target_input_columns(target, target_type)
+        expected_output_columns = cls._get_expected_output_columns(data.default_metrics)
+        return [{"name": "index", "type": "number"}, *input_columns, *expected_output_columns]
 
     @classmethod
-    def _get_target_input_column_names(
+    def _get_target_input_columns(
         cls,
         target: Union[App, CustomizedSnippet],
         target_type: str,
-    ) -> list[str]:
+    ) -> list[EvaluationTemplateColumn]:
         """Resolve user-input variables for the target in workflow order."""
         if target_type == EvaluationTargetType.APPS.value and isinstance(target, App):
             input_fields = cls._get_app_input_fields(target)
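Concretely, for a hypothetical workflow app whose form defines a text input `query` and a numeric input `top_k`, and whose selected metrics require one reference answer, the refactored builder would assemble something like the following (column names other than `index` are illustrative):

template_columns = [
    {"name": "index", "type": "number"},            # always first, always numeric
    {"name": "query", "type": "string"},            # from the app's user-input form
    {"name": "top_k", "type": "number"},            # via _normalize_template_column_type
    {"name": "expected_output", "type": "string"},  # reference answer for the metric
]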
@@ -690,23 +695,30 @@ class EvaluationService:
         else:
             raise ValueError(f"Unsupported target type: {target_type}")
 
-        columns: list[str] = []
+        columns: list[EvaluationTemplateColumn] = []
         seen: set[str] = set()
         for field in input_fields:
             column_name = str(field.get("variable") or field.get("label") or "").strip()
             if not column_name or column_name in seen:
                 continue
             seen.add(column_name)
-            columns.append(column_name)
+            columns.append(
+                {
+                    "name": column_name,
+                    "type": cls._normalize_template_column_type(
+                        field.get("type") or field.get("value_type") or field.get("input_type")
+                    ),
+                }
+            )
         return columns
 
     @classmethod
-    def _get_expected_output_column_names(
+    def _get_expected_output_columns(
         cls,
         default_metrics: list[DefaultMetric | Mapping[str, Any]],
-    ) -> list[str]:
+    ) -> list[EvaluationTemplateColumn]:
         """Build one expected_output column per visible node that needs a reference answer."""
-        columns: list[str] = []
+        columns: list[EvaluationTemplateColumn] = []
         seen: set[str] = set()
         for metric in cls.filter_console_default_metrics(default_metrics):
             if metric.metric not in cls.METRICS_REQUIRING_EXPECTED_OUTPUT:
@@ -717,9 +729,20 @@ class EvaluationService:
             if column_name in seen:
                 continue
             seen.add(column_name)
-            columns.append(column_name)
+            columns.append({"name": column_name, "type": "string"})
         return columns
 
+    @staticmethod
+    def _normalize_template_column_type(raw_type: Any) -> str:
+        normalized = str(raw_type or "").strip().lower()
+        if normalized in {"number", "integer", "float", "int"}:
+            return "number"
+        if normalized in {"bool", "boolean", "switch"}:
+            return "boolean"
+        if normalized in {"file", "files"}:
+            return "file"
+        return "string"
+
     @classmethod
     def _nodes_for_metrics_from_workflow(
         cls,
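`_normalize_template_column_type` collapses the assorted raw form-field type strings into the four template types, defaulting to "string" for anything unrecognized or missing. Its behavior can be pinned down with a few assertions ("select" here is an assumed example of an unrecognized raw type):

assert EvaluationService._normalize_template_column_type("number") == "number"
assert EvaluationService._normalize_template_column_type("Int") == "number"      # case-insensitive
assert EvaluationService._normalize_template_column_type("switch") == "boolean"
assert EvaluationService._normalize_template_column_type("files") == "file"
assert EvaluationService._normalize_template_column_type("select") == "string"   # unrecognized
assert EvaluationService._normalize_template_column_type(None) == "string"       # missing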