mirror of https://github.com/langgenius/dify.git (synced 2026-05-11 23:18:39 +08:00)

commit 7a065b3f42
parent 6c0c9a2f5b

    evaluation runtime

This commit moves the evaluation runtime from single-run to batch, per-item execution: EvaluationRunData.input_list becomes a typed list of EvaluationItemInput, targets are executed once per item through EvaluationService.execute_targets, and metric runners receive per-item node results as lists instead of a single result and mapping.
@@ -24,6 +24,7 @@ class EvaluationMetric(BaseModel):
 class EvaluationItemInput(BaseModel):
+    index: int
     inputs: dict[str, Any]
     output: str
     expected_output: str | None = None
     context: list[str] | None = None
 
@@ -89,4 +90,4 @@ class EvaluationRunData(BaseModel):
     default_metrics: list[DefaultMetric] = Field(default_factory=list)
     customized_metrics: CustomizedMetrics | None = None
     judgment_config: JudgmentConfig | None = None
-    input_list: list[dict]
+    input_list: list[EvaluationItemInput]
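With input_list typed as list[EvaluationItemInput] rather than list[dict], every dataset row carries a stable position (index) plus validated fields. A minimal sketch of what a typed payload could look like; the row values are invented for illustration:

from typing import Any

from pydantic import BaseModel


class EvaluationItemInput(BaseModel):
    index: int
    inputs: dict[str, Any]
    output: str
    expected_output: str | None = None
    context: list[str] | None = None


# Invented rows; real items come from the uploaded evaluation dataset.
input_list = [
    EvaluationItemInput(index=0, inputs={"query": "What is Dify?"}, output=""),
    EvaluationItemInput(index=1, inputs={"query": "Ping?"}, output="", expected_output="Pong"),
]
assert [item.index for item in input_list] == [0, 1]

The index field is what lets partially evaluated results be merged back into the right slots later in the runner.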
@@ -42,8 +42,8 @@ class BaseEvaluationRunner(ABC):
     @abstractmethod
     def evaluate_metrics(
         self,
-        node_run_result_mapping: dict[str, NodeRunResult] | None,
-        node_run_result: NodeRunResult | None,
+        node_run_result_mapping_list: list[dict[str, NodeRunResult]] | None,
+        node_run_result_list: list[NodeRunResult] | None,
         default_metric: DefaultMetric | None,
         customized_metrics: CustomizedMetrics | None,
         model_provider: str,
@@ -59,12 +59,12 @@ class BaseEvaluationRunner(ABC):
         tenant_id: str,
         target_id: str,
         target_type: str,
-        node_run_result: NodeRunResult | None = None,
+        node_run_result_list: list[NodeRunResult] | None = None,
         default_metric: DefaultMetric | None = None,
         customized_metrics: CustomizedMetrics | None = None,
         model_provider: str = "",
         model_name: str = "",
-        node_run_result_mapping: dict[str, NodeRunResult] | None = None,
+        node_run_result_mapping_list: list[dict[str, NodeRunResult]] | None = None,
     ) -> list[EvaluationItemResult]:
         """Orchestrate target execution + metric evaluation + judgment for all items."""
         evaluation_run = self.session.query(EvaluationRun).filter_by(id=evaluation_run_id).first()
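Under the list-based contract, evaluate_metrics sees every item's node results in one call and run returns one EvaluationItemResult per item. A minimal sketch of the per-item scoring a concrete runner might do, with the repo's result types stubbed as dataclasses and the signature trimmed to the list parameter; the exact-match metric is invented:

from dataclasses import dataclass, field
from typing import Any


@dataclass
class NodeRunResult:  # illustrative stand-in for the repo's NodeRunResult
    outputs: dict[str, Any] = field(default_factory=dict)


@dataclass
class EvaluationItemResult:  # illustrative stand-in as well
    index: int
    score: float


class ExactMatchRunner:
    """Hypothetical runner: scores 1.0 when a node's 'answer' equals 'expected'."""

    def evaluate_metrics(
        self,
        node_run_result_list: list[NodeRunResult] | None,
    ) -> list[EvaluationItemResult]:
        results: list[EvaluationItemResult] = []
        for index, node_run_result in enumerate(node_run_result_list or []):
            outputs = node_run_result.outputs
            score = 1.0 if outputs.get("answer") == outputs.get("expected") else 0.0
            results.append(EvaluationItemResult(index=index, score=score))
        return results


scored = ExactMatchRunner().evaluate_metrics(
    [NodeRunResult(outputs={"answer": "Pong", "expected": "Pong"})]
)
assert scored[0].score == 1.0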
@@ -82,11 +82,11 @@ class BaseEvaluationRunner(ABC):
         results: list[EvaluationItemResult] = []
 
         # Phase 1: run evaluation
-        if node_run_result.status == WorkflowNodeExecutionStatus.SUCCEEDED:
+        if default_metric and node_run_result_list:
             try:
                 evaluated_results = self.evaluate_metrics(
-                    node_run_result_mapping=node_run_result_mapping,
-                    node_run_result=node_run_result,
+                    node_run_result_mapping_list=node_run_result_mapping_list,
+                    node_run_result_list=node_run_result_list,
                     default_metric=default_metric,
                     customized_metrics=customized_metrics,
                     model_provider=model_provider,
@@ -100,6 +100,19 @@ class BaseEvaluationRunner(ABC):
                     results[i] = evaluated_by_index[result.index]
             except Exception:
                 logger.exception("Failed to compute metrics for evaluation run %s", evaluation_run_id)
+        if customized_metrics and node_run_result_mapping_list:
+            try:
+                evaluated_results = self.evaluate_metrics(
+                    node_run_result_mapping_list=node_run_result_mapping_list,
+                    node_run_result_list=node_run_result_list,
+                    default_metric=default_metric,
+                    customized_metrics=customized_metrics,
+                    model_provider=model_provider,
+                    model_name=model_name,
+                    tenant_id=tenant_id,
+                )
+            except Exception:
+                logger.exception("Failed to compute metrics for evaluation run %s", evaluation_run_id)
 
         # Phase 4: Persist individual items
         for result in results:
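The merge step above (results[i] = evaluated_by_index[result.index]) relies on each evaluated result carrying its item index, so partial metric output overwrites only the matching slots and untouched items keep their defaults. The same pattern in isolation, with invented placeholder dicts:

# Placeholder results keyed by item index; the data is invented.
results = [
    {"index": 0, "score": 0.0},
    {"index": 1, "score": 0.0},
    {"index": 2, "score": 0.0},
]
evaluated_results = [
    {"index": 2, "score": 0.9},
    {"index": 0, "score": 0.4},
]

# Build an index -> result lookup, then overwrite matching slots in place.
evaluated_by_index = {r["index"]: r for r in evaluated_results}
for i, result in enumerate(results):
    if result["index"] in evaluated_by_index:
        results[i] = evaluated_by_index[result["index"]]

assert [r["score"] for r in results] == [0.4, 0.0, 0.9]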
@@ -27,6 +27,7 @@ from extensions.ext_database import db
 from libs.datetime_utils import naive_utc_now
 from models.evaluation import EvaluationRun, EvaluationRunStatus
 from models.model import UploadFile
+from services.evaluation_service import EvaluationService
 
 logger = logging.getLogger(__name__)
 
@@ -76,7 +77,20 @@ def _execute_evaluation(session: Any, run_data: EvaluationRunData) -> None:
     if evaluation_instance is None:
         raise ValueError("Evaluation framework not configured")
 
-    _execute_evaluation_runner(session, run_data, evaluation_instance, node_run_result_mapping)
+    evaluation_service = EvaluationService()
+    node_run_result_mapping_list: list[dict[str, NodeRunResult]] = evaluation_service.execute_targets(
+        tenant_id=run_data.tenant_id,
+        target_type=run_data.target_type,
+        target_id=run_data.target_id,
+        input_list=run_data.input_list,
+    )
+
+    results: list[EvaluationItemResult] = _execute_evaluation_runner(
+        session,
+        run_data,
+        evaluation_instance,
+        node_run_result_mapping_list,
+    )
 
 
     # Compute summary metrics
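execute_targets now fans the target out over every row of input_list and returns one node-id to NodeRunResult mapping per row, so the list length tracks the dataset. A shape-only sketch; the node id and outputs are invented and the result type is stubbed:

from dataclasses import dataclass, field
from typing import Any


@dataclass
class NodeRunResult:  # illustrative stand-in for the repo's type
    outputs: dict[str, Any] = field(default_factory=dict)


# One mapping per dataset item; "llm_node" is an invented workflow node id.
node_run_result_mapping_list: list[dict[str, NodeRunResult]] = [
    {"llm_node": NodeRunResult(outputs={"answer": "An LLMOps platform"})},  # item 0
    {"llm_node": NodeRunResult(outputs={"answer": "Pong"})},                # item 1
]
assert len(node_run_result_mapping_list) == 2  # one entry per input item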
@@ -106,15 +120,19 @@ def _execute_evaluation_runner(
     session: Any,
     run_data: EvaluationRunData,
     evaluation_instance: BaseEvaluationInstance,
-    node_run_result_mapping: dict[str, NodeRunResult],
+    node_run_result_mapping_list: list[dict[str, NodeRunResult]],
 ) -> list[EvaluationItemResult]:
     """Execute the evaluation runner."""
     default_metrics = run_data.default_metrics
     customized_metrics = run_data.customized_metrics
     for default_metric in default_metrics:
         for node_info in default_metric.node_info_list:
-            node_run_result = node_run_result_mapping.get(node_info.node_id)
-            if node_run_result:
+            node_run_result_list: list[NodeRunResult] = []
+            for node_run_result_mapping in node_run_result_mapping_list:
+                node_run_result = node_run_result_mapping.get(node_info.node_id)
+                if node_run_result is not None:
+                    node_run_result_list.append(node_run_result)
+            if node_run_result_list:
                 runner = _create_runner(EvaluationCategory(node_info.type), evaluation_instance, session)
                 runner.run(
                     evaluation_run_id=run_data.evaluation_run_id,
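For each metric node, the runner now gathers that node's result from every item's mapping and simply skips items where the node produced nothing, rather than zeroing the metric outright. The collection loop in isolation, with strings standing in for NodeRunResult objects and invented node ids:

node_run_result_mapping_list = [
    {"node_a": "run-0-a", "node_b": "run-0-b"},
    {"node_a": "run-1-a"},  # node_b produced no result for item 1
]

node_id = "node_b"
node_run_result_list = []
for node_run_result_mapping in node_run_result_mapping_list:
    node_run_result = node_run_result_mapping.get(node_id)
    if node_run_result is not None:
        node_run_result_list.append(node_run_result)

assert node_run_result_list == ["run-0-b"]  # only items where node_b ran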
@@ -125,10 +143,8 @@ def _execute_evaluation_runner(
                     customized_metrics=None,
                     model_provider=run_data.evaluation_model_provider,
                     model_name=run_data.evaluation_model,
-                    node_run_result=node_run_result,
+                    node_run_result_list=node_run_result_list,
                 )
-            else:
-                default_metric.score = 0
     if customized_metrics:
         runner = _create_runner(EvaluationCategory.WORKFLOW, evaluation_instance, session)
         runner.run(
@@ -138,8 +154,8 @@ def _execute_evaluation_runner(
             target_type=run_data.target_type,
             default_metric=None,
             customized_metrics=customized_metrics,
-            node_run_result=None,
-            node_run_result_mapping=node_run_result_mapping,
+            node_run_result_list=None,
+            node_run_result_mapping_list=node_run_result_mapping_list,
         )
 
 
 def _create_runner(