fix: evaluation (#35728)

Co-authored-by: jyong <718720800@qq.com>
Co-authored-by: Yansong Zhang <916125788@qq.com>
Co-authored-by: autofix-ci[bot] <114827586+autofix-ci[bot]@users.noreply.github.com>
Co-authored-by: hj24 <mambahj24@gmail.com>
Co-authored-by: hj24 <huangjian@dify.ai>
Co-authored-by: Joel <iamjoel007@gmail.com>
Co-authored-by: Stephen Zhou <38493346+hyoban@users.noreply.github.com>
Co-authored-by: CodingOnStar <hanxujiang@dify.com>
Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com>
This commit is contained in:
FFXN 2026-04-30 15:32:01 +08:00 committed by GitHub
parent 3373b63716
commit 22653b7464
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 66 additions and 7 deletions

View File

@ -234,7 +234,7 @@ def get_evaluation_target(view_func: Callable[P, R]):
return decorated_view
def _load_evaluation_run_request_and_dataset(tenant_id: str) -> tuple[EvaluationRunRequest, bytes]:
def _load_evaluation_run_request_and_dataset(tenant_id: str) -> tuple[EvaluationRunRequest, bytes, str]:
"""Validate the run payload and load the uploaded dataset bytes."""
body = request.get_json(force=True)
if not body:
@ -257,7 +257,7 @@ def _load_evaluation_run_request_and_dataset(tenant_id: str) -> tuple[Evaluation
if not dataset_content:
raise BadRequest("Dataset file is empty.")
return run_request, dataset_content
return run_request, dataset_content, upload_file.name
@console_ns.route("/<string:evaluate_target_type>/<uuid:evaluate_target_id>/dataset-template/download")
@ -434,7 +434,7 @@ class EvaluationRunApi(Resource):
- judgment_config: judgment conditions config (optional)
"""
current_account, current_tenant_id = current_account_with_tenant()
run_request, dataset_content = _load_evaluation_run_request_and_dataset(current_tenant_id)
run_request, dataset_content, dataset_filename = _load_evaluation_run_request_and_dataset(current_tenant_id)
try:
with Session(db.engine, expire_on_commit=False) as session:
@ -446,6 +446,7 @@ class EvaluationRunApi(Resource):
target_id=str(target.id),
account_id=str(current_account.id),
dataset_file_content=dataset_content,
dataset_filename=dataset_filename,
run_request=run_request,
)
else:
@ -456,6 +457,7 @@ class EvaluationRunApi(Resource):
target_id=str(target.id),
account_id=str(current_account.id),
dataset_file_content=dataset_content,
dataset_filename=dataset_filename,
run_request=run_request,
)
return _serialize_evaluation_run(evaluation_run), 200
@ -483,7 +485,7 @@ class EvaluationRunRealApi(Resource):
def post(self, target: Union[App, CustomizedSnippet, Dataset], target_type: str):
"""Start the real evaluation execution flow on the temporary dev path."""
current_account, current_tenant_id = current_account_with_tenant()
run_request, dataset_content = _load_evaluation_run_request_and_dataset(current_tenant_id)
run_request, dataset_content, dataset_filename = _load_evaluation_run_request_and_dataset(current_tenant_id)
try:
with Session(db.engine, expire_on_commit=False) as session:
@ -494,6 +496,7 @@ class EvaluationRunRealApi(Resource):
target_id=str(target.id),
account_id=str(current_account.id),
dataset_file_content=dataset_content,
dataset_filename=dataset_filename,
run_request=run_request,
)
return _serialize_evaluation_run(evaluation_run), 200

View File

@ -1,6 +1,7 @@
import io
import json
import logging
import csv
from collections.abc import Mapping
from typing import Any, Union
@ -352,6 +353,7 @@ class EvaluationService:
target_id: str,
account_id: str,
dataset_file_content: bytes,
dataset_filename: str,
run_request: EvaluationRunRequest,
) -> EvaluationRun:
"""Validate dataset, create run record, dispatch Celery task.
@ -386,7 +388,7 @@ class EvaluationService:
raise EvaluationMaxConcurrentRunsError(f"Maximum concurrent runs ({max_concurrent}) reached.")
# Parse dataset
items = cls._parse_dataset(dataset_file_content)
items = cls._parse_dataset(dataset_file_content, dataset_filename)
max_rows = dify_config.EVALUATION_MAX_DATASET_ROWS
if len(items) > max_rows:
raise EvaluationDatasetInvalidError(f"Dataset has {len(items)} rows, max is {max_rows}.")
@ -437,6 +439,7 @@ class EvaluationService:
target_id: str,
account_id: str,
dataset_file_content: bytes,
dataset_filename: str,
run_request: EvaluationRunRequest,
) -> EvaluationRun:
"""Persist a completed synthetic run for frontend integration testing.
@ -461,7 +464,7 @@ class EvaluationService:
data=run_request,
)
items = cls._parse_dataset(dataset_file_content)
items = cls._parse_dataset(dataset_file_content, dataset_filename)
max_rows = dify_config.EVALUATION_MAX_DATASET_ROWS
if len(items) > max_rows:
raise EvaluationDatasetInvalidError(f"Dataset has {len(items)} rows, max is {max_rows}.")
@ -932,7 +935,15 @@ class EvaluationService:
# ---- Dataset Parsing ----
@classmethod
def _parse_dataset(cls, file_content: bytes, filename: str) -> list[EvaluationDatasetInput]:
    """Dispatch dataset parsing based on the uploaded file's extension.

    Files whose name ends in ``.csv`` (case-insensitive) are parsed as
    CSV; any other name falls through to the XLSX parser.
    """
    if filename.lower().endswith(".csv"):
        return cls._parse_csv_dataset(file_content)
    return cls._parse_xlsx_dataset(file_content)
@classmethod
def _parse_xlsx_dataset(cls, xlsx_content: bytes) -> list[EvaluationDatasetInput]:
"""Parse evaluation dataset from XLSX bytes."""
wb = load_workbook(io.BytesIO(xlsx_content), read_only=True)
ws = wb.active
@ -979,6 +990,51 @@ class EvaluationService:
wb.close()
return items
@classmethod
def _parse_csv_dataset(cls, csv_content: bytes) -> list[EvaluationDatasetInput]:
    """Parse evaluation dataset from UTF-8 CSV bytes.

    CSV follows the same schema as XLSX:
    the first column must be `index`, remaining columns become inputs,
    and `expected_output` is extracted into a dedicated field.

    Raises:
        EvaluationDatasetInvalidError: when the bytes are not valid
            UTF-8, when there is no header row plus at least one data
            row, or when the first header cell is not ``index``.
    """
    # utf-8-sig transparently strips a leading BOM if one is present.
    try:
        text = csv_content.decode("utf-8-sig")
    except UnicodeDecodeError as exc:
        raise EvaluationDatasetInvalidError("CSV file must be UTF-8 encoded.") from exc

    all_rows = list(csv.reader(io.StringIO(text)))
    if len(all_rows) < 2:
        raise EvaluationDatasetInvalidError("Dataset must have at least a header row and one data row.")

    header_cells = ["" if cell is None else str(cell).strip() for cell in all_rows[0]]
    if not header_cells or header_cells[0].lower() != "index":
        raise EvaluationDatasetInvalidError("First column header must be 'index'.")

    column_names = header_cells[1:]
    parsed: list[EvaluationDatasetInput] = []
    for position, raw_row in enumerate(all_rows[1:], start=1):
        cells = list(raw_row)
        # Entirely blank rows are silently ignored.
        if all(not str(cell).strip() for cell in cells):
            continue
        # Fall back to the row position when the index cell is missing
        # or not an integer literal.
        try:
            row_index = int(str(cells[0] if cells else position))
        except (TypeError, ValueError):
            row_index = position
        row_inputs: dict[str, Any] = {}
        for offset, column in enumerate(column_names, start=1):
            # Ragged rows are padded with empty strings on the right.
            cell = cells[offset] if offset < len(cells) else None
            row_inputs[column] = "" if cell is None else str(cell)
        expected = row_inputs.pop("expected_output", None)
        parsed.append(EvaluationDatasetInput(index=row_index, inputs=row_inputs, expected_output=expected))
    return parsed
@classmethod
def _build_stub_results(
cls,