Mirror of https://github.com/langgenius/dify.git (synced 2026-05-11 23:18:39 +08:00)

evaluation runtime

commit 4e593df662 (parent 7251bffae1)
@@ -20,9 +20,10 @@ from controllers.console.wraps import (
    edit_permission_required,
    setup_required,
)
from core.evaluation.entities.evaluation_entity import EvaluationCategory
from core.evaluation.entities.evaluation_entity import EvaluationCategory, EvaluationRunRequest
from core.workflow.file import helpers as file_helpers
from extensions.ext_database import db
from extensions.ext_storage import storage
from libs.helper import TimestampField
from libs.login import current_account_with_tenant, login_required
from models import App
@@ -332,33 +333,43 @@ class EvaluationRunApi(Resource):
        """
        Start an evaluation run.

        Expects multipart form data with:
        - file: XLSX dataset file
        - evaluation_category: one of llm, retrieval, agent, workflow
        Expects JSON body with:
        - file_id: uploaded dataset file ID
        - evaluation_model: evaluation model name
        - evaluation_model_provider: evaluation model provider
        - default_metrics: list of default metric objects
        - customized_metrics: customized metrics object (optional)
        - judgment_config: judgment conditions config (optional)
        """
        current_account, current_tenant_id = current_account_with_tenant()

        # Validate file upload
        if "file" not in request.files:
            raise BadRequest("Dataset file is required.")
        file = request.files["file"]
        if not file.filename or not file.filename.endswith(".xlsx"):
            raise BadRequest("Dataset file must be an XLSX file.")
        body = request.get_json(force=True)
        if not body:
            raise BadRequest("Request body is required.")

        # Validate and parse request body
        try:
            run_request = EvaluationRunRequest.model_validate(body)
        except Exception as e:
            raise BadRequest(f"Invalid request body: {e}")

        # Load dataset file
        upload_file = (
            db.session.query(UploadFile)
            .filter_by(id=run_request.file_id, tenant_id=current_tenant_id)
            .first()
        )
        if not upload_file:
            raise NotFound("Dataset file not found.")

        try:
            dataset_content = storage.load_once(upload_file.key)
        except Exception:
            raise BadRequest("Failed to read dataset file.")

        dataset_content = file.read()
        if not dataset_content:
            raise BadRequest("Dataset file is empty.")

        # Validate evaluation category
        category_str = request.form.get("evaluation_category", "llm")
        try:
            evaluation_category = EvaluationCategory(category_str)
        except ValueError:
            raise BadRequest(
                f"Invalid evaluation_category: {category_str}. "
                f"Must be one of: {', '.join(e.value for e in EvaluationCategory)}"
            )

        try:
            with Session(db.engine, expire_on_commit=False) as session:
                evaluation_run = EvaluationService.start_evaluation_run(
@@ -368,7 +379,7 @@ class EvaluationRunApi(Resource):
                    target_id=str(target.id),
                    account_id=str(current_account.id),
                    dataset_file_content=dataset_content,
                    evaluation_category=evaluation_category,
                    run_request=run_request,
                )
            return _serialize_evaluation_run(evaluation_run), 200
        except EvaluationFrameworkNotConfiguredError as e:
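For reference, a minimal sketch of the JSON body this endpoint now expects, assembled from the docstring above; every value is illustrative, and the dataset must already have been uploaded so it can be referenced by file_id. The controller validates the payload with EvaluationRunRequest.model_validate before it touches storage.

# Illustrative payload only; IDs, provider, model and metric names are placeholders.
body = {
    "file_id": "<uploaded-dataset-file-id>",
    "evaluation_model_provider": "openai",      # assumed provider name
    "evaluation_model": "gpt-4o",               # assumed model name
    "default_metrics": [
        {
            "metric": "faithfulness",           # assumed metric name
            "node_info_list": [
                {"node_id": "node-1", "type": "llm", "title": "LLM"},
            ],
        },
    ],
}

run_request = EvaluationRunRequest.model_validate(body)  # same call the controller makes
assert run_request.customized_metrics is None and run_request.judgment_config is None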
@@ -8,9 +8,10 @@ from core.evaluation.entities.judgment_entity import JudgmentConfig, JudgmentRes

class EvaluationCategory(StrEnum):
    LLM = "llm"
    RETRIEVAL = "retrieval"
    RETRIEVAL = "knowledge_retrieval"
    AGENT = "agent"
    WORKFLOW = "workflow"
    RETRIEVAL_TEST = "retrieval_test"


class EvaluationMetric(BaseModel):
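A small sketch of what the enum change means on the wire, relying only on standard StrEnum behaviour: RETRIEVAL now serialises as "knowledge_retrieval", RETRIEVAL_TEST is new, and the old "retrieval" string no longer round-trips.

assert EvaluationCategory.RETRIEVAL == "knowledge_retrieval"
assert EvaluationCategory("retrieval_test") is EvaluationCategory.RETRIEVAL_TEST

try:
    EvaluationCategory("retrieval")  # old value: no matching member any more
except ValueError:
    pass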
@@ -42,6 +43,38 @@ class EvaluationItemResult(BaseModel):
        return sum(scores) / len(scores)


class NodeInfo(BaseModel):
    node_id: str
    type: str
    title: str


class DefaultMetric(BaseModel):
    metric: str
    node_info_list: list[NodeInfo]


class CustomizedMetricOutputField(BaseModel):
    variable: str
    value_type: str


class CustomizedMetrics(BaseModel):
    evaluation_workflow_id: str
    input_fields: dict[str, str]
    output_fields: list[CustomizedMetricOutputField]


class EvaluationRunRequest(BaseModel):
    """Request body for starting an evaluation run."""
    file_id: str
    evaluation_model: str = ""
    evaluation_model_provider: str = ""
    default_metrics: list[DefaultMetric] = Field(default_factory=list)
    customized_metrics: CustomizedMetrics | None = None
    judgment_config: JudgmentConfig | None = None


class EvaluationRunData(BaseModel):
    """Serializable data for Celery task."""
    evaluation_run_id: str
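A quick sketch of how the new request entities compose; the workflow ID and variable names are placeholders. Because evaluation_model, evaluation_model_provider and default_metrics all have defaults, a request that only carries customized metrics still validates.

# Placeholders throughout; only the field structure is meaningful.
request = EvaluationRunRequest(
    file_id="<uploaded-dataset-file-id>",
    customized_metrics=CustomizedMetrics(
        evaluation_workflow_id="<evaluation-workflow-id>",
        input_fields={"query": "input_text"},                      # assumed variable mapping
        output_fields=[
            CustomizedMetricOutputField(variable="score", value_type="number"),
        ],
    ),
)
assert request.default_metrics == []
assert request.evaluation_model == "" and request.evaluation_model_provider == ""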
@@ -10,9 +10,11 @@ from sqlalchemy.orm import Session

from configs import dify_config
from core.evaluation.entities.evaluation_entity import (
    DefaultMetric,
    EvaluationCategory,
    EvaluationItemInput,
    EvaluationRunData,
    EvaluationRunRequest,
)
from core.evaluation.evaluation_manager import EvaluationManager
from models.evaluation import (
@@ -255,18 +257,44 @@ class EvaluationService:
        target_id: str,
        account_id: str,
        dataset_file_content: bytes,
        evaluation_category: EvaluationCategory,
        run_request: EvaluationRunRequest,
    ) -> EvaluationRun:
        """Validate dataset, create run record, dispatch Celery task."""
        """Validate dataset, create run record, dispatch Celery task.

        Saves the provided parameters as the latest EvaluationConfiguration
        before creating the run.
        """
        # Check framework is configured
        evaluation_instance = EvaluationManager.get_evaluation_instance()
        if evaluation_instance is None:
            raise EvaluationFrameworkNotConfiguredError()

        # Check evaluation config exists
        config = cls.get_evaluation_config(session, tenant_id, target_type, target_id)
        if config is None:
            raise EvaluationNotFoundError("Evaluation configuration not found. Please configure evaluation first.")
        # Derive evaluation_category from default_metrics node types
        evaluation_category = cls._resolve_evaluation_category(run_request.default_metrics)

        # Build metrics_config from default_metrics and customized_metrics
        metrics_config: dict[str, Any] = {
            "default_metrics": [m.model_dump() for m in run_request.default_metrics],
        }
        if run_request.customized_metrics is not None:
            metrics_config["customized_metrics"] = run_request.customized_metrics.model_dump()

        # Save as latest EvaluationConfiguration
        config = cls.save_evaluation_config(
            session=session,
            tenant_id=tenant_id,
            target_type=target_type,
            target_id=target_id,
            account_id=account_id,
            data={
                "evaluation_model_provider": run_request.evaluation_model_provider,
                "evaluation_model": run_request.evaluation_model,
                "metrics_config": metrics_config,
                "judgement_conditions": (
                    run_request.judgment_config.model_dump() if run_request.judgment_config else {}
                ),
            },
        )

        # Check concurrent run limit
        active_runs = (
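For illustration, the metrics_config dict assembled above for a single default metric and no customized metrics; the metric and node names are placeholders, and the nested shape simply follows DefaultMetric.model_dump().

# Assumed input: one DefaultMetric, no customized_metrics.
default_metrics = [
    DefaultMetric(
        metric="faithfulness",                                            # placeholder metric
        node_info_list=[NodeInfo(node_id="node-1", type="llm", title="LLM")],
    ),
]
metrics_config = {"default_metrics": [m.model_dump() for m in default_metrics]}
# metrics_config == {
#     "default_metrics": [
#         {"metric": "faithfulness",
#          "node_info_list": [{"node_id": "node-1", "type": "llm", "title": "LLM"}]},
#     ],
# }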
@@ -308,9 +336,10 @@ class EvaluationService:
            target_type=target_type,
            target_id=target_id,
            evaluation_category=evaluation_category,
            evaluation_model_provider=config.evaluation_model_provider or "",
            evaluation_model=config.evaluation_model or "",
            metrics_config=config.metrics_config_dict,
            evaluation_model_provider=run_request.evaluation_model_provider,
            evaluation_model=run_request.evaluation_model,
            metrics_config=metrics_config,
            judgment_config=run_request.judgment_config,
            items=items,
        )
@@ -406,6 +435,23 @@ class EvaluationService:
    def get_supported_metrics(cls, category: EvaluationCategory) -> list[str]:
        return EvaluationManager.get_supported_metrics(category)

    # ---- Category Resolution ----

    @classmethod
    def _resolve_evaluation_category(cls, default_metrics: list[DefaultMetric]) -> EvaluationCategory:
        """Derive evaluation category from default_metrics node_info types.

        Uses the type of the first node_info found in default_metrics.
        Falls back to LLM if no metrics are provided.
        """
        for metric in default_metrics:
            for node_info in metric.node_info_list:
                try:
                    return EvaluationCategory(node_info.type)
                except ValueError:
                    continue
        return EvaluationCategory.LLM

    # ---- Dataset Parsing ----

    @classmethod
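A usage sketch of the resolution helper above; node IDs, titles and the "code" type are placeholders. Node types that do not map onto an EvaluationCategory value are skipped, and an empty metrics list falls back to LLM.

metrics = [
    DefaultMetric(
        metric="recall",                                                   # placeholder metric
        node_info_list=[
            NodeInfo(node_id="n1", type="code", title="Code"),             # unknown type -> skipped
            NodeInfo(node_id="n2", type="knowledge_retrieval", title="Retrieval"),
        ],
    ),
]
assert EvaluationService._resolve_evaluation_category(metrics) is EvaluationCategory.RETRIEVAL
assert EvaluationService._resolve_evaluation_category([]) is EvaluationCategory.LLM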
@@ -1,6 +1,8 @@
import io
import json
import logging
from configs import dify_config
from models.model import UploadFile
from typing import Any

from celery import shared_task
@@ -279,20 +281,16 @@ def _store_result_file(
    """Store result XLSX file and return the UploadFile ID."""
    try:
        from extensions.ext_storage import storage
        from models.model import UploadFile

        from libs.uuid_utils import uuidv7

        file_id = str(uuidv7())
        filename = f"evaluation-result-{run_id[:8]}.xlsx"
        storage_key = f"evaluation_results/{tenant_id}/{file_id}.xlsx"
        storage_key = f"evaluation_results/{tenant_id}/{str(uuidv7())}.xlsx"

        storage.save(storage_key, xlsx_content)

        upload_file = UploadFile(
            id=file_id,
        upload_file: UploadFile = UploadFile(
            tenant_id=tenant_id,
            storage_type="evaluation_result",
            storage_type=dify_config.STORAGE_TYPE,
            key=storage_key,
            name=filename,
            size=len(xlsx_content),
@@ -300,10 +298,12 @@ def _store_result_file(
            mime_type="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
            created_by_role="account",
            created_by="system",
            created_at=naive_utc_now(),
            used=False,
        )
        session.add(upload_file)
        session.commit()
        return file_id
        return upload_file.id
    except Exception:
        logger.exception("Failed to store result file for run %s", run_id)
        return None
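For context, a short sketch of how a result stored by this helper could be read back, mirroring how the controller loads the dataset file earlier in this commit; file_id is assumed to be the value returned above, and session an open SQLAlchemy session.

# Assumes file_id is the ID returned by _store_result_file and session is an open DB session.
upload_file = session.query(UploadFile).filter_by(id=file_id, tenant_id=tenant_id).first()
if upload_file is not None:
    result_bytes = storage.load_once(upload_file.key)  # evaluation-result XLSX bytes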